Fixed nll with label_smoothing to just nll #28708

Merged: 7 commits, Feb 20, 2024
Changes from 6 commits
12 changes: 12 additions & 0 deletions src/transformers/models/blip/configuration_blip.py
@@ -94,6 +94,10 @@ class BlipTextConfig(PretrainedConfig):
Whether the model is used as a decoder.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
label_smoothing (`float`, *optional*, defaults to 0.0):
A float in [0.0, 1.0] that specifies the amount of smoothing when computing the loss, where 0.0 means no
smoothing. The targets become a mixture of the original ground truth and a uniform distribution as described
in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).

Example:

@@ -133,6 +137,7 @@ def __init__(
sep_token_id=102,
is_decoder=True,
use_cache=True,
label_smoothing=0.0,
**kwargs,
):
super().__init__(
@@ -158,6 +163,7 @@ def __init__(
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.is_decoder = is_decoder
self.use_cache = use_cache
self.label_smoothing = label_smoothing

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
@@ -298,6 +304,10 @@ class BlipConfig(PretrainedConfig):
The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
image_text_hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden state of the image-text fusion layer.
label_smoothing (`float`, *optional*, defaults to 0.0):
A float in [0.0, 1.0] that specifies the amount of smoothing when computing the loss, where 0.0 means no
smoothing. The targets become a mixture of the original ground truth and a uniform distribution as described
in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).
kwargs (*optional*):
Dictionary of keyword arguments.

@@ -333,6 +343,7 @@ def __init__(
projection_dim=512,
logit_scale_init_value=2.6592,
image_text_hidden_size=256,
label_smoothing=0.0,
**kwargs,
):
super().__init__(**kwargs)
@@ -355,6 +366,7 @@ def __init__(
self.initializer_factor = 1.0
self.initializer_range = 0.02
self.image_text_hidden_size = image_text_hidden_size
self.label_smoothing = label_smoothing

@classmethod
def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):
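With the config changes above, the smoothing amount becomes a user-facing knob rather than a hardcoded constant. A minimal sketch of the intended usage (hypothetical values, not part of this diff):

```python
from transformers import BlipTextConfig

# Opt back into the previous behavior by passing 0.1 explicitly...
smoothed = BlipTextConfig(label_smoothing=0.1)
print(smoothed.label_smoothing)  # 0.1

# ...or take the new default of 0.0, i.e. no smoothing and a plain NLL loss.
plain = BlipTextConfig()
print(plain.label_smoothing)  # 0.0
```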
3 changes: 2 additions & 1 deletion src/transformers/models/blip/modeling_blip_text.py
@@ -813,6 +813,7 @@ def __init__(self, config):

self.bert = BlipTextModel(config, add_pooling_layer=False)
self.cls = BlipTextOnlyMLMHead(config)
self.label_smoothing = config.label_smoothing

def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -893,7 +894,7 @@ def forward(
# we are doing next-token prediction; shift prediction scores and input ids by one
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
- loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if reduction == "none":
lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
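For reference, with `label_smoothing=0.0` PyTorch's `CrossEntropyLoss` reduces exactly to the negative log-likelihood of the log-softmaxed logits, which is what the PR title means by "just nll". A self-contained check with toy shapes (not model code):

```python
import torch
from torch.nn import CrossEntropyLoss, NLLLoss

logits = torch.randn(4, 30524)          # toy (batch, vocab_size) prediction scores
labels = torch.randint(0, 30524, (4,))  # toy next-token targets

# With smoothing disabled, cross-entropy equals NLL of the log-softmax...
ce = CrossEntropyLoss(label_smoothing=0.0)(logits, labels)
nll = NLLLoss()(torch.log_softmax(logits, dim=-1), labels)
assert torch.allclose(ce, nll)

# ...while the previously hardcoded 0.1 mixes a uniform distribution into the
# targets, so the two losses diverge.
ce_smoothed = CrossEntropyLoss(label_smoothing=0.1)(logits, labels)
print(ce.item(), ce_smoothed.item())
```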
5 changes: 4 additions & 1 deletion src/transformers/models/blip/modeling_tf_blip_text.py
@@ -976,6 +976,7 @@ def __init__(self, config, **kwargs):

self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
self.label_smoothing = config.label_smoothing

def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1063,7 +1064,9 @@ def call(
# Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
# Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
- loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
+ loss_fct = keras.losses.CategoricalCrossentropy(
+     from_logits=True, label_smoothing=self.label_smoothing, reduction="none"
+ )
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
lm_loss *= masked_positions
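The TF path needs the one-hot detour because Keras exposes `label_smoothing` only on the dense cross-entropy, not the sparse one. A standalone sketch of that trick under the same assumptions (toy shapes, `-100` as the ignore label):

```python
import tensorflow as tf
from tensorflow import keras

vocab_size = 30524
labels = tf.constant([3, -100, 7])          # -100 marks positions to ignore
logits = tf.random.normal((3, vocab_size))  # toy shifted prediction scores

# relu clamps the -100 labels to 0 so tf.one_hot sees valid indices; the mask
# below zeroes those positions out of the loss anyway.
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=vocab_size, dtype=tf.float32)
loss_fct = keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.0, reduction="none"
)
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, logits) * masked_positions
print(lm_loss)  # per-token loss, zero at the ignored position
```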