From e828a1b7abf3ce18483a7c46beea29af315de4f9 Mon Sep 17 00:00:00 2001
From: ydshieh
Date: Thu, 18 Apr 2024 15:51:53 +0200
Subject: [PATCH] fix

---
 .../models/clip/convert_clip_original_pytorch_to_hf.py | 3 ++-
 src/transformers/models/clip/modeling_clip.py          | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
index 2127da4f6cf902..41b45d50209974 100644
--- a/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
+++ b/src/transformers/models/clip/convert_clip_original_pytorch_to_hf.py
@@ -124,7 +124,8 @@ def convert_clip_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa
     copy_vison_model_and_projection(hf_model, pt_model)
     hf_model.logit_scale = pt_model.logit_scale
 
-    input_ids = torch.arange(0, 77).unsqueeze(0)
+    # Use an `eos_token` so the example is more meaningful
+    input_ids = torch.tensor([[config.text_config.bos_token_id] + list(range(3, 77)) + [config.text_config.eos_token_id] + [config.text_config.pad_token_id]])
     pixel_values = torch.randn(1, 3, 224, 224)
 
     hf_outputs = hf_model(input_ids=input_ids, pixel_values=pixel_values, return_dict=True)
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index a4ce51625ebf76..3257f27c59df02 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -734,6 +734,7 @@ def forward(
         pooled_output = last_hidden_state[
             torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device),
             # We need to get the first position of `eos_token_id` value (`pad_token_ids` might equal to `eos_token_id`)
+            # Note: we assume the input always has an eos token in each text (i.e. it is always prepared by the CLIP tokenizer)
             (input_ids.to(dtype=torch.int, device=last_hidden_state.device) == self.eos_token_id)
             .int()
             .argmax(dim=-1),
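
Note (not part of the patch): below is a minimal, self-contained sketch of the pooling logic the second hunk comments on, using made-up token ids rather than CLIP's real ones. It shows why the conversion script now builds `input_ids` that contain an actual `eos_token_id`: the mask-plus-argmax indexing picks the first position whose id equals `eos_token_id`, and if no such token is present (as with the old `torch.arange(0, 77)` input), `argmax` over an all-zero mask returns 0 and a wrong position would be pooled.

    import torch

    # Hypothetical ids for illustration only; real CLIP checkpoints use different values,
    # but pad and eos can share an id, which is exactly the ambiguous case the comment mentions.
    eos_token_id = 2
    pad_token_id = 2

    # bos, a few tokens, the real eos, then padding that reuses the eos id
    input_ids = torch.tensor([[0, 5, 6, 7, eos_token_id, pad_token_id, pad_token_id]])
    last_hidden_state = torch.randn(1, input_ids.shape[1], 4)

    # Boolean mask of positions equal to eos; `.int().argmax(dim=-1)` returns the index of the
    # first maximal value, i.e. the first eos, so trailing pads do not shift the pooled position.
    eos_positions = (input_ids == eos_token_id).int().argmax(dim=-1)
    pooled_output = last_hidden_state[torch.arange(last_hidden_state.shape[0]), eos_positions]

    print(eos_positions)        # tensor([4])
    print(pooled_output.shape)  # torch.Size([1, 4])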