diff --git a/src/transformers/models/blip/modeling_blip.py b/src/transformers/models/blip/modeling_blip.py
index 90018a8b98218a..27dbbee6c671ee 100644
--- a/src/transformers/models/blip/modeling_blip.py
+++ b/src/transformers/models/blip/modeling_blip.py
@@ -464,7 +464,8 @@ class BlipPreTrainedModel(PreTrainedModel):
     config_class = BlipConfig
     base_model_prefix = "blip"
     supports_gradient_checkpointing = True
-    _no_split_modules = ["BlipEncoderLayer"]
+    _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"]
+    _skip_keys_device_placement = ["past_key_value"]
 
     def _init_weights(self, module):
         """Initialize the weights"""
@@ -1010,7 +1011,8 @@ def forward(
         text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
 
         # cosine similarity as logits
-        logit_scale = self.logit_scale.exp()
+        logit_scale = self.logit_scale.exp().to(device=text_embeds.device)
+        image_embeds = image_embeds.to(device=text_embeds.device, dtype=text_embeds.dtype)
         logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
         logits_per_image = logits_per_text.t()
 
diff --git a/src/transformers/models/blip/modeling_blip_text.py b/src/transformers/models/blip/modeling_blip_text.py
index 97a4f523380bc5..db8ad939725aca 100644
--- a/src/transformers/models/blip/modeling_blip_text.py
+++ b/src/transformers/models/blip/modeling_blip_text.py
@@ -82,7 +82,6 @@ def forward(
             position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
 
         if inputs_embeds is None:
-            input_ids = input_ids.to(self.word_embeddings.weight.device)
             inputs_embeds = self.word_embeddings(input_ids)
 
         embeddings = inputs_embeds