Fixed nll with label_smoothing to just nll #28708

Merged: 7 commits, Feb 20, 2024
Changes from 6 commits
12 changes: 12 additions & 0 deletions src/transformers/models/blip/configuration_blip.py
@@ -94,6 +94,10 @@ class BlipTextConfig(PretrainedConfig):
Whether the model is used as a decoder.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
label_smoothing (`float`, *optional*, defaults to 0.0):
A float in [0.0, 1.0] that specifies the amount of smoothing when computing the loss, where 0.0 means no
smoothing. The targets become a mixture of the original ground truth and a uniform distribution as described
in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).

Example:

@@ -133,6 +137,7 @@ def __init__(
sep_token_id=102,
is_decoder=True,
use_cache=True,
label_smoothing=0.0,
**kwargs,
):
super().__init__(
@@ -158,6 +163,7 @@ def __init__(
self.attention_probs_dropout_prob = attention_probs_dropout_prob
self.is_decoder = is_decoder
self.use_cache = use_cache
self.label_smoothing = label_smoothing

@classmethod
def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
@@ -298,6 +304,10 @@ class BlipConfig(PretrainedConfig):
The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
image_text_hidden_size (`int`, *optional*, defaults to 256):
Dimensionality of the hidden state of the image-text fusion layer.
label_smoothing (`float`, *optional*, defaults to 0.0):
A float in [0.0, 1.0] that specifies the amount of smoothing when computing the loss, where 0.0 means no
smoothing. The targets become a mixture of the original ground truth and a uniform distribution as described
in [Rethinking the Inception Architecture for Computer Vision](https://arxiv.org/abs/1512.00567).
kwargs (*optional*):
Dictionary of keyword arguments.

@@ -333,6 +343,7 @@ def __init__(
projection_dim=512,
logit_scale_init_value=2.6592,
image_text_hidden_size=256,
label_smoothing=0.0,
**kwargs,
):
super().__init__(**kwargs)
@@ -355,6 +366,7 @@ def __init__(
self.initializer_factor = 1.0
self.initializer_range = 0.02
self.image_text_hidden_size = image_text_hidden_size
self.label_smoothing = label_smoothing

@classmethod
def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs):
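With the config changes above, the smoothing amount becomes a user-facing knob rather than a hardcoded constant. A minimal sketch of the intended usage (hypothetical values, not part of this diff):

```python
from transformers import BlipTextConfig

# Opt back into the previous behavior by passing 0.1 explicitly...
smoothed = BlipTextConfig(label_smoothing=0.1)
print(smoothed.label_smoothing)  # 0.1

# ...or take the new default of 0.0, i.e. no smoothing and a plain NLL loss.
plain = BlipTextConfig()
print(plain.label_smoothing)  # 0.0
```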
3 changes: 2 additions & 1 deletion src/transformers/models/blip/modeling_blip_text.py
@@ -813,6 +813,7 @@ def __init__(self, config):

self.bert = BlipTextModel(config, add_pooling_layer=False)
self.cls = BlipTextOnlyMLMHead(config)
self.label_smoothing = config.label_smoothing

def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -893,7 +894,7 @@ def forward(
# we are doing next-token prediction; shift prediction scores and input ids by one
shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
- loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=0.1)
+ loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
if reduction == "none":
lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
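For reference, with `label_smoothing=0.0` PyTorch's `CrossEntropyLoss` reduces exactly to the negative log-likelihood of the log-softmaxed logits, which is what the PR title means by "just nll". A self-contained check with toy shapes (not model code):

```python
import torch
from torch.nn import CrossEntropyLoss, NLLLoss

logits = torch.randn(4, 30524)          # toy (batch, vocab_size) prediction scores
labels = torch.randint(0, 30524, (4,))  # toy next-token targets

# With smoothing disabled, cross-entropy equals NLL of the log-softmax...
ce = CrossEntropyLoss(label_smoothing=0.0)(logits, labels)
nll = NLLLoss()(torch.log_softmax(logits, dim=-1), labels)
assert torch.allclose(ce, nll)

# ...while the previously hardcoded 0.1 mixes a uniform distribution into the
# targets, so the two losses diverge.
ce_smoothed = CrossEntropyLoss(label_smoothing=0.1)(logits, labels)
print(ce.item(), ce_smoothed.item())
```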
5 changes: 4 additions & 1 deletion src/transformers/models/blip/modeling_tf_blip_text.py
@@ -976,6 +976,7 @@ def __init__(self, config, **kwargs):

self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
self.label_smoothing = config.label_smoothing

def get_output_embeddings(self):
return self.cls.predictions.decoder
@@ -1063,7 +1064,9 @@ def call(
# Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
# Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
- loss_fct = keras.losses.CategoricalCrossentropy(from_logits=True, label_smoothing=0.1, reduction="none")
+ loss_fct = keras.losses.CategoricalCrossentropy(
+     from_logits=True, label_smoothing=self.label_smoothing, reduction="none"
+ )
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
lm_loss *= masked_positions
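The TF path needs the one-hot detour because Keras exposes `label_smoothing` only on the dense cross-entropy, not the sparse one. A standalone sketch of that trick under the same assumptions (toy shapes, `-100` as the ignore label):

```python
import tensorflow as tf
from tensorflow import keras

vocab_size = 30524
labels = tf.constant([3, -100, 7])          # -100 marks positions to ignore
logits = tf.random.normal((3, vocab_size))  # toy shifted prediction scores

# relu clamps the -100 labels to 0 so tf.one_hot sees valid indices; the mask
# below zeroes those positions out of the loss anyway.
one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=vocab_size, dtype=tf.float32)
loss_fct = keras.losses.CategoricalCrossentropy(
    from_logits=True, label_smoothing=0.0, reduction="none"
)
masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
lm_loss = loss_fct(one_hot_labels, logits) * masked_positions
print(lm_loss)  # per-token loss, zero at the ignored position
```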