diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 5c814742c9ac36..7b26ab8fa412e5 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -93,6 +93,7 @@ def __call__( # For backward compatibility legacy = kwargs.pop("legacy", True) if legacy: + # With `add_special_tokens=True`, the performance of Donut is degraded when working with both images and text. logger.warning_once( "Legacy behavior is being used. The new behavior with legacy=False will be enabled in the future." "In the new behavior, if both images and text are provided, the default value of `add_special_tokens` " diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index 2f8ee3229ff2cd..d60c76393f02bb 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -436,6 +436,7 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): "feature-extraction": BlipModel, "image-to-text": BlipForConditionalGeneration, "visual-question-answering": BlipForQuestionAnswering, + "image-text-to-text": BlipForConditionalGeneration, } if is_torch_available() else {} diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index f2ccb2da8dba94..8d914f202e3236 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -705,6 +705,7 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi "feature-extraction": Blip2Model, "image-to-text": Blip2ForConditionalGeneration, "visual-question-answering": Blip2ForConditionalGeneration, + "image-text-to-text": Blip2ForConditionalGeneration, } if is_torch_available() else {} diff --git a/tests/models/chameleon/test_modeling_chameleon.py b/tests/models/chameleon/test_modeling_chameleon.py index 
aad26ef147e83e..2adca2c6da0668 100644 --- a/tests/models/chameleon/test_modeling_chameleon.py +++ b/tests/models/chameleon/test_modeling_chameleon.py @@ -279,6 +279,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester { "feature-extraction": ChameleonModel, "text-generation": ChameleonForConditionalGeneration, + "image-text-to-text": ChameleonForConditionalGeneration, } if is_torch_available() else {} diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py index 9425bddb6f703c..4bd66ab945f441 100644 --- a/tests/models/fuyu/test_modeling_fuyu.py +++ b/tests/models/fuyu/test_modeling_fuyu.py @@ -265,7 +265,9 @@ def prepare_config_and_inputs_for_common(self): @require_torch class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (FuyuForCausalLM,) if is_torch_available() else () - pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_torch_available() else {} + pipeline_model_mapping = ( + {"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {} + ) test_head_masking = False test_pruning = False diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 33da9e26cba03d..ccfb41459caf73 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -401,7 +401,12 @@ class GitModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, all_model_classes = (GitModel, GitForCausalLM) if is_torch_available() else () all_generative_model_classes = (GitForCausalLM,) if is_torch_available() else () pipeline_model_mapping = ( - {"feature-extraction": GitModel, "image-to-text": GitForCausalLM, "text-generation": GitForCausalLM} + { + "feature-extraction": GitModel, + "image-to-text": GitForCausalLM, + "text-generation": GitForCausalLM, + "image-text-to-text": GitForCausalLM, + } if is_torch_available() else {} ) diff 
--git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index a49bce8d878fb4..4d1000b89422f7 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -323,7 +323,11 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): @require_torch class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): all_model_classes = (IdeficsModel, IdeficsForVisionText2Text) if is_torch_available() else () - pipeline_model_mapping = {"feature-extraction": IdeficsModel} if is_torch_available() else {} + pipeline_model_mapping = ( + {"feature-extraction": IdeficsModel, "image-text-to-text": IdeficsForVisionText2Text} + if is_torch_available() + else {} + ) test_pruning = False test_headmasking = False test_torchscript = False diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index e02c5b4c9f09c6..6908a9ad770c35 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -334,6 +334,7 @@ class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest """ all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": Idefics2ForConditionalGeneration} if is_torch_available() else {} fx_compatible = False test_pruning = False test_resize_embeddings = True diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 550bb2785e0057..ff1d33f352ba19 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -321,6 +321,7 @@ class Idefics3ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTest """ all_model_classes = (Idefics3ForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": 
Idefics3ForConditionalGeneration} if is_torch_available() else {} fx_compatible = False test_pruning = False test_resize_embeddings = True diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index 8292567334bf3b..01c43b6c4490e6 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -454,6 +454,9 @@ def prepare_config_and_inputs_for_common(self): @require_torch class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = ( + {"image-text-to-text": InstructBlipForConditionalGeneration} if is_torch_available() else {} + ) fx_compatible = False test_head_masking = False test_pruning = False diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 396a4179388fb8..423344b5a80ad6 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -248,7 +248,11 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase) all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else () all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else () pipeline_model_mapping = ( - {"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration} + { + "feature-extraction": Kosmos2Model, + "image-to-text": Kosmos2ForConditionalGeneration, + "image-text-to-text": Kosmos2ForConditionalGeneration, + } if is_torch_available() else {} ) diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index e183c38a59f7d7..1da7b401ea7914 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -183,7 
+183,11 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else () - pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {} + pipeline_model_mapping = ( + {"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration} + if is_torch_available() + else {} + ) test_pruning = False test_head_masking = False diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index a54aeab8a28252..ee41a7a504a62c 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -216,6 +216,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {} test_pruning = False test_head_masking = False diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 0e9c88cb3463fd..2b74fd584ac918 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -217,6 +217,9 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = ( + 
{"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {} + ) test_pruning = False test_head_masking = False diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 5c5ca3985ee08f..a0c96604d314be 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -271,6 +271,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else {} test_pruning = False test_head_masking = False test_torchscript = False diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 644ac2cc5bd1b4..ee9f10354b7810 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -183,6 +183,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration} if is_torch_available() else {} fx_compatible = False test_pruning = False test_torchscript = False diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py index 2d762008cbbc3d..8d8543eae7f85a 100644 --- a/tests/models/pix2struct/test_modeling_pix2struct.py +++ b/tests/models/pix2struct/test_modeling_pix2struct.py @@ -419,7 +419,11 @@ def prepare_config_and_inputs_for_common(self): @require_torch class Pix2StructModelTest(ModelTesterMixin, 
PipelineTesterMixin, unittest.TestCase): all_model_classes = (Pix2StructForConditionalGeneration,) if is_torch_available() else () - pipeline_model_mapping = {"image-to-text": Pix2StructForConditionalGeneration} if is_torch_available() else {} + pipeline_model_mapping = ( + {"image-to-text": Pix2StructForConditionalGeneration, "image-text-to-text": Pix2StructForConditionalGeneration} + if is_torch_available() + else {} + ) fx_compatible = False test_head_masking = False test_pruning = False diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 956243dccebebf..76951d07fce2fb 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -224,6 +224,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration} if is_torch_available() else {} test_pruning = False test_head_masking = False diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index a3ae498606a379..d0e42a97d6d8d6 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -274,7 +274,11 @@ class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): else () ) all_generative_model_classes = (UdopForConditionalGeneration,) if is_torch_available() else () - pipeline_model_mapping = {"feature-extraction": UdopModel} if is_torch_available() else {} + pipeline_model_mapping = ( + {"feature-extraction": UdopModel, "image-text-to-text": UdopForConditionalGeneration} + if is_torch_available() + else {} + ) fx_compatible = False test_pruning = False test_torchscript = False diff --git a/tests/models/vipllava/test_modeling_vipllava.py 
b/tests/models/vipllava/test_modeling_vipllava.py index b12f2c30c774a0..70d6d22fc5eb58 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -164,6 +164,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else () all_generative_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else () + pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {} fx_compatible = False test_pruning = False test_resize_embeddings = True