change default image-text-to-text model to llava onevision
yonigozlan committed Oct 29, 2024
1 parent 64acc98 commit ccbac25
Showing 2 changed files with 2 additions and 2 deletions.
src/transformers/pipelines/__init__.py (1 addition, 1 deletion)

@@ -392,7 +392,7 @@
"pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
"default": {
"model": {
"pt": ("Salesforce/blip-image-captioning-base", "89b09ea"),
"pt": ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "2c9ba3b"),
}
},
"type": "multimodal",
Expand Down
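
A minimal usage sketch of the effect of this change (not part of the commit: the task alias and checkpoint name come from the diff above; the sample image URL, prompt, and max_new_tokens value are illustrative):

from transformers import pipeline

# With no model argument, the "image-text-to-text" task should now resolve
# to the pinned LLaVA OneVision checkpoint instead of the BLIP captioner.
pipe = pipeline("image-text-to-text")

# Illustrative call: a public sample image and a free-form prompt.
result = pipe(
    images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
    text="Describe this image.",
    max_new_tokens=30,
)
print(result)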
src/transformers/pipelines/image_text_to_text.py (1 addition, 1 deletion)

@@ -237,7 +237,7 @@ def __call__(
             The text to be used for generation. If a list of strings is passed, the length of the list should be the
             same as the number of images. Text can also follow the chat format: a list of dictionaries where each
             dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and
-            'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a dictionary
+            'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of dictionaries
             containing the text of the message and the type of the message. The type of the message can be either
             'text' or 'image'. If the type is 'image', no text is needed.
         return_tensors (`bool`, *optional*, defaults to `False`):
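
For reference, a minimal sketch of the chat format the corrected docstring describes (not part of the commit: the model name matches the new default above; the image URL and prompt are placeholders):

from transformers import pipeline

pipe = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf")

# Chat format: each message has a 'role' and a 'content' list of
# dictionaries; an 'image' entry carries no text, a 'text' entry does.
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
            {"type": "text", "text": "What do you see in this image?"},
        ],
    }
]
result = pipe(text=messages, max_new_tokens=30)
print(result)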
