From ccbac25fe4d716fdb2556e2b614d88e9e770f17c Mon Sep 17 00:00:00 2001
From: yonigozlan
Date: Tue, 29 Oct 2024 14:50:07 +0000
Subject: [PATCH] change default image-text-to-text model to llava onevision

---
 src/transformers/pipelines/__init__.py           | 2 +-
 src/transformers/pipelines/image_text_to_text.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index b926f7e464f158..07156b3cf1dbe2 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -392,7 +392,7 @@
         "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
         "default": {
             "model": {
-                "pt": ("Salesforce/blip-image-captioning-base", "89b09ea"),
+                "pt": ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "2c9ba3b"),
             }
         },
         "type": "multimodal",
diff --git a/src/transformers/pipelines/image_text_to_text.py b/src/transformers/pipelines/image_text_to_text.py
index 6c9ab04da42dfe..39738ffc385dbe 100644
--- a/src/transformers/pipelines/image_text_to_text.py
+++ b/src/transformers/pipelines/image_text_to_text.py
@@ -237,7 +237,7 @@ def __call__(
                 The text to be used for generation. If a list of strings is passed, the length of the list should
                 be the same as the number of images. Text can also follow the chat format: a list of dictionaries
                 where each dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and
-                'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a dictionary
+                'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of dictionaries
                 containing the text of the message and the type of the message. The type of the message can be either
                 'text' or 'image'. If the type is 'image', no text is needed.
             return_tensors (`bool`, *optional*, defaults to `False`):
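
As a sanity check for reviewers, here is a minimal usage sketch of the new default model together with the chat format described in the updated docstring. The image URL, `max_new_tokens` value, and the printed output key are illustrative assumptions, not part of this patch:

    from transformers import pipeline

    # "image-text-to-text" now resolves to llava-hf/llava-onevision-qwen2-0.5b-ov-hf by default.
    pipe = pipeline("image-text-to-text")

    # Chat format: 'content' is a list of dictionaries, each with a 'type'
    # ('text' or 'image'); entries of type 'image' carry no text, the image
    # itself is passed via the `images` argument.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]

    outputs = pipe(
        images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",
        text=messages,
        max_new_tokens=32,  # illustrative generation kwarg
    )
    print(outputs[0]["generated_text"])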