diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index fa6594398a2343..c4be519b9c8be2 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -17,7 +17,6 @@ """ import logging -import warnings from typing import List, Optional, Union from ...feature_extraction_utils import BatchFeature @@ -97,12 +96,6 @@ def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_i image_token (`str`): The image token. num_images (`int`): Number of images in the prompt. """ - if image_token in prompt: - warnings.warn( - f"The image token {image_token} is already present in the prompt. No need to manually add {image_token} in the prompt for this model." - f" Removing all {image_token} and adding ({image_token}) * image_seq_len * num_images at the start of the prompt." - ) - prompt = prompt.replace(image_token, "") return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n" diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py index 4e6c82d723a560..0231b13732d2cb 100644 --- a/src/transformers/pipelines/image_text_to_text.py +++ b/src/transformers/pipelines/image_text_to_text.py @@ -85,6 +85,10 @@ def retrieve_images_in_chat(chat: dict, images: Optional[Union[str, List[str], " if isinstance(content, dict) and content.get("type") == "image": if "image" in content: retrieved_images.append(content["image"]) + elif "url" in content: + retrieved_images.append(content["url"]) + elif "path" in content: + retrieved_images.append(content["path"]) elif idx_images < len(images): retrieved_images.append(images[idx_images]) idx_images += 1 @@ -128,7 +132,7 @@ class ImageTextToTextPipeline(Pipeline): >>> "content": [ >>> { >>> "type": "image", - >>> "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + >>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", >>> }, >>> {"type": "text", "text": "Describe this image."}, >>> ], @@ -143,7 +147,7 @@ class ImageTextToTextPipeline(Pipeline): >>> pipe(text=messages, max_new_tokens=20, return_full_text=False) [{'input_text': [{'role': 'user', 'content': [{'type': 'image', - 'image': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, + 'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, {'type': 'text', 'text': 'Describe this image.'}]}, {'role': 'assistant', 'content': [{'type': 'text', 'text': 'There is a dog and'}]}], @@ -298,7 +302,7 @@ def __call__( if not isinstance(images, (list, tuple)): images = [images] if isinstance(text, str): - text = [text] * len(images) + text = [text] if not isinstance(text[0], str): raise ValueError("The pipeline does not support nested lists of prompts.") @@ -335,10 +339,18 @@ def __call__( images_reorganized.append(images[:num_images]) images = images[num_images:] images = images_reorganized - # After reorganizing, these should be the same - if len(images) != len(text): - raise ValueError("The number of images and text should be the same.") + elif len(text) == 1 and len(images) > 1: + logger.warning( + "The pipeline detected multiple images for one prompt, but no image tokens in the prompt. " + "The prompt will be repeated for each image." + ) + text = [text[0]] * len(images) + # After reorganizing, these should be the same + if len(text) > 1 and len(images) != len(text): + raise ValueError( + "Undefined behavior, please check the number of images and prompts, and nest the images to match the prompts." + ) return super().__call__([ImageText(image, text_single) for image, text_single in zip(images, text)], **kwargs) def preprocess( diff --git a/tests/pipelines/test_pipelines_image_text_to_text.py b/tests/pipelines/test_pipelines_image_text_to_text.py index 1e33436e9fda2a..c13514c2379183 100644 --- a/tests/pipelines/test_pipelines_image_text_to_text.py +++ b/tests/pipelines/test_pipelines_image_text_to_text.py @@ -172,7 +172,7 @@ def test_model_pt_chat_template_continue_final_message(self): "content": [ { "type": "image", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", }, {"type": "text", "text": "Describe this image."}, ], @@ -195,7 +195,7 @@ def test_model_pt_chat_template_continue_final_message(self): "content": [ { "type": "image", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", }, {"type": "text", "text": "Describe this image."}, ], @@ -208,7 +208,7 @@ def test_model_pt_chat_template_continue_final_message(self): "content": [ { "type": "image", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", }, {"type": "text", "text": "Describe this image."}, ], @@ -237,7 +237,7 @@ def test_model_pt_chat_template_new_text(self): "content": [ { "type": "image", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", }, {"type": "text", "text": "Describe this image."}, ], @@ -254,7 +254,7 @@ def test_model_pt_chat_template_new_text(self): "content": [ { "type": "image", - "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", }, {"type": "text", "text": "Describe this image."}, ],