From 61ac161a9d67dd6e7a7a971d8828f08fb127a2c6 Mon Sep 17 00:00:00 2001 From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Date: Tue, 1 Oct 2024 23:52:03 +0200 Subject: [PATCH] Add support for custom inputs and batched inputs in ProcessorTesterMixin (#33711) * add support for custom inputs and batched inputs in ProcessorTesterMixin * Fix batch_size behavior ProcessorTesterMixin * Change format prepare inputs batched * Remove override test pixtral processor * Remove unnecessary tests and cleanup after new prepare_inputs functions * Fix instructBlipVideo image processor --- .../image_processing_instructblipvideo.py | 7 +- tests/models/fuyu/test_processing_fuyu.py | 12 +- .../idefics3/test_processing_idefics3.py | 70 +++----- .../models/kosmos2/test_processor_kosmos2.py | 12 +- .../omdet_turbo/test_processor_omdet_turbo.py | 166 +----------------- .../pix2struct/test_processor_pix2struct.py | 22 +-- .../models/pixtral/test_processor_pixtral.py | 33 +--- tests/test_processing_common.py | 42 +++-- 8 files changed, 95 insertions(+), 269 deletions(-) diff --git a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py index 131b8fe57bd665..b83df54785fa14 100644 --- a/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/image_processing_instructblipvideo.py @@ -57,8 +57,11 @@ def make_batched_videos(videos) -> List[VideoInput]: elif len(videos[0].shape) == 4: return [list(video) for video in videos] - elif is_valid_image(videos) and len(videos.shape) == 4: - return [list(videos)] + elif is_valid_image(videos): + if isinstance(videos, PIL.Image.Image): + return [[videos]] + elif len(videos.shape) == 4: + return [list(videos)] raise ValueError(f"Could not make batched video from {videos}") diff --git a/tests/models/fuyu/test_processing_fuyu.py 
b/tests/models/fuyu/test_processing_fuyu.py index 69a1d53e86f766..39a47293040bdd 100644 --- a/tests/models/fuyu/test_processing_fuyu.py +++ b/tests/models/fuyu/test_processing_fuyu.py @@ -190,7 +190,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -218,7 +218,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -237,7 +237,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -264,7 +264,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. image_input = None @@ -290,7 +290,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # Fuyu uses tokenizer kwargs only when image is None. 
image_input = None inputs = processor( @@ -315,7 +315,7 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] + input_str = self.prepare_text_inputs(batch_size=2) # Fuyu uses tokenizer kwargs only when image is None. image_input = None inputs = processor( diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processing_idefics3.py index 92f0fbb1f8e268..a53109b02b6951 100644 --- a/tests/models/idefics3/test_processing_idefics3.py +++ b/tests/models/idefics3/test_processing_idefics3.py @@ -17,6 +17,7 @@ import tempfile import unittest from io import BytesIO +from typing import Optional import numpy as np import requests @@ -284,44 +285,29 @@ def test_apply_chat_template(self): ) self.assertEqual(rendered, expected_rendered) - @require_torch - @require_vision - def test_image_processor_defaults_preserved_by_image_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) + # Override as Idefics3Processor needs image tokens in prompts + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer <image>" - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer <image>" - image_input = self.prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 3) - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 364) # crop size 
doesn't affect our image processor - - @require_torch - @require_vision - def test_kwargs_overrides_default_image_processor_kwargs(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component( - "image_processor", max_image_size={"longest_edge": 32}, size={"longest_edge": 32} + if batch_size == 1: + return ["lower newer <image>"] + return ["lower newer <image>", "<image> upper older longer string"] + ["<image> lower newer"] * ( + batch_size - 2 ) - tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, image_seq_len=2) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer <image>" - image_input = self.prepare_image_inputs() - - inputs = processor(text=input_str, images=image_input) - self.assertEqual(len(inputs["pixel_values"][0][0]), 3) - self.assertEqual(len(inputs["pixel_values"][0][0][0]), 32) - self.assertEqual(len(inputs["input_ids"][0]), 117) + # Override as Idefics3Processor needs nested images to work properly with batched inputs + @require_vision + def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + if batch_size is None: + return super().prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return [[super().prepare_image_inputs()]] * batch_size @require_vision @require_torch @@ -333,7 +319,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30) @@ 
-350,7 +336,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -378,7 +364,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -402,7 +388,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") @@ -419,11 +405,11 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) inputs = processor( text=input_str, - images=[image_input, image_input], + images=image_input, return_tensors="pt", padding="longest", max_length=76, @@ -446,7 +432,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( 
text=input_str, diff --git a/tests/models/kosmos2/test_processor_kosmos2.py b/tests/models/kosmos2/test_processor_kosmos2.py index 8de398ade70c71..8874c7d1d30e03 100644 --- a/tests/models/kosmos2/test_processor_kosmos2.py +++ b/tests/models/kosmos2/test_processor_kosmos2.py @@ -499,7 +499,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # set image input to None image_input = None @@ -525,7 +525,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -551,7 +551,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -574,7 +574,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # set image input to None image_input = None @@ -593,7 +593,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() # set image input to None image_input = None inputs = processor( @@ -618,7 +618,7 @@ def 
test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] + input_str = self.prepare_text_inputs(batch_size=2) # set image input to None image_input = None inputs = processor( diff --git a/tests/models/omdet_turbo/test_processor_omdet_turbo.py b/tests/models/omdet_turbo/test_processor_omdet_turbo.py index e6e2a1f50c52cd..52e1926e50b22f 100644 --- a/tests/models/omdet_turbo/test_processor_omdet_turbo.py +++ b/tests/models/omdet_turbo/test_processor_omdet_turbo.py @@ -17,7 +17,6 @@ import tempfile import unittest -import numpy as np import pytest from transformers import AutoProcessor, CLIPTokenizerFast, OmDetTurboProcessor @@ -36,8 +35,6 @@ from transformers.models.omdet_turbo.modeling_omdet_turbo import OmDetTurboObjectDetectionOutput if is_vision_available(): - from PIL import Image - from transformers import DetrImageProcessor @@ -45,6 +42,7 @@ @require_vision class OmDetTurboProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor_class = OmDetTurboProcessor + text_input_name = "classes_input_ids" def setUp(self): self.tmpdirname = tempfile.mkdtemp() @@ -77,17 +75,6 @@ def get_image_processor(self, **kwargs): def tearDown(self): shutil.rmtree(self.tmpdirname) - def prepare_image_inputs(self): - """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, - or a list of PyTorch tensors if one specifies torchify=True. 
- """ - - image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] - - image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] - - return image_inputs - def get_fake_omdet_turbo_output(self): torch.manual_seed(42) return OmDetTurboObjectDetectionOutput( @@ -210,154 +197,3 @@ def test_model_input_names(self): inputs = processor(images=image_input, text=input_classes, task=input_tasks, return_tensors="pt") self.assertListEqual(list(inputs.keys()), self.input_keys) - - @require_vision - @require_torch - def test_tokenizer_defaults_preserved_by_kwargs(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, text=[input_str], task=input_str, return_tensors="pt") - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 117) - self.assertEqual(len(inputs["classes_input_ids"][0]), 117) - - @require_vision - @require_torch - def test_kwargs_overrides_default_tokenizer_kwargs(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. 
- if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer", max_length=117) - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor(images=image_input, text=[input_str], task=input_str, return_tensors="pt", max_length=112) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 112) - self.assertEqual(len(inputs["classes_input_ids"][0]), 112) - - @require_torch - @require_vision - def test_unstructured_kwargs(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - inputs = processor( - images=image_input, - text=[input_str], - task=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="max_length", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - self.assertEqual(len(inputs["tasks_input_ids"][0]), 76) - self.assertEqual(len(inputs["classes_input_ids"][0]), 76) - - @require_torch - @require_vision - def test_unstructured_kwargs_batched(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. 
- if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 - inputs = processor( - images=image_input, - text=[input_str], - task=input_str, - return_tensors="pt", - size={"height": 214, "width": 214}, - padding="longest", - max_length=76, - ) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 6) - self.assertEqual(len(inputs["classes_input_ids"][0]), 6) - - @require_torch - @require_vision - def test_structured_kwargs_nested(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. 
- if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76, "task": input_str}, - } - - inputs = processor(images=image_input, text=[input_str], **all_kwargs) - self.skip_processor_without_typed_kwargs(processor) - - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 76) - self.assertEqual(len(inputs["classes_input_ids"][0]), 76) - - @require_torch - @require_vision - def test_structured_kwargs_nested_from_dict(self): - # Rewrite as OmDet-Turbo processor outputs "input_ids" for both tasks and classes. 
- if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - - image_processor = self.get_component("image_processor") - tokenizer = self.get_component("tokenizer") - - processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) - self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" - image_input = self.prepare_image_inputs() - - # Define the kwargs for each modality - all_kwargs = { - "common_kwargs": {"return_tensors": "pt"}, - "images_kwargs": {"size": {"height": 214, "width": 214}}, - "text_kwargs": {"padding": "max_length", "max_length": 76, "task": input_str}, - } - - inputs = processor(images=image_input, text=[input_str], **all_kwargs) - self.assertEqual(inputs["pixel_values"].shape[2], 214) - - self.assertEqual(len(inputs["tasks_input_ids"][0]), 76) - self.assertEqual(len(inputs["classes_input_ids"][0]), 76) diff --git a/tests/models/pix2struct/test_processor_pix2struct.py b/tests/models/pix2struct/test_processor_pix2struct.py index ac8d4822f1c09f..f832ffd2d64f7c 100644 --- a/tests/models/pix2struct/test_processor_pix2struct.py +++ b/tests/models/pix2struct/test_processor_pix2struct.py @@ -96,7 +96,7 @@ def test_tokenizer(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() encoded_processor = processor(text=input_str) @@ -111,7 +111,7 @@ def test_processor(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -130,7 +130,7 @@ def test_processor_max_patches(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = 
self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -168,7 +168,7 @@ def test_model_input_names(self): processor = Pix2StructProcessor(tokenizer=tokenizer, image_processor=image_processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -195,7 +195,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input) @@ -213,7 +213,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, max_patches=1024) @@ -231,7 +231,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, @@ -257,8 +257,8 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] - image_input = self.prepare_image_inputs() * 2 + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) inputs = 
processor( text=input_str, images=image_input, @@ -284,7 +284,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -313,7 +313,7 @@ def test_structured_kwargs_nested_from_dict(self): processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality diff --git a/tests/models/pixtral/test_processor_pixtral.py b/tests/models/pixtral/test_processor_pixtral.py index 59c19eabcaf53b..8cdbf93c6476b8 100644 --- a/tests/models/pixtral/test_processor_pixtral.py +++ b/tests/models/pixtral/test_processor_pixtral.py @@ -14,6 +14,7 @@ import shutil import tempfile import unittest +from typing import Optional import requests import torch @@ -246,27 +247,11 @@ def test_processor_with_multiple_images_multiple_lists(self): # fmt: on # Override as PixtralProcessor needs nested images to work properly with batched inputs - def test_unstructured_kwargs_batched(self): - if "image_processor" not in self.processor_class.attributes: - self.skipTest(f"image_processor attribute not present in {self.processor_class}") - processor_components = self.prepare_components() - processor = self.processor_class(**processor_components) - self.skip_processor_without_typed_kwargs(processor) - - input_str = ["lower newer", "upper older longer string"] - image_input = [self.prepare_image_inputs()] * 2 - inputs = processor( - text=input_str, - images=image_input, - return_tensors="pt", - do_rescale=True, - rescale_factor=-1, - padding="longest", - max_length=76, - ) - - 
self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) - self.assertTrue( - len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1]) - and len(inputs[self.text_input_name][1]) < 76 - ) + @require_vision + def prepare_image_inputs(self, batch_size: Optional[int] = None): + """This function prepares a list of PIL images for testing""" + if batch_size is None: + return super().prepare_image_inputs() + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return [[super().prepare_image_inputs()]] * batch_size diff --git a/tests/test_processing_common.py b/tests/test_processing_common.py index 8cc71147c22013..187cf50c733cb6 100644 --- a/tests/test_processing_common.py +++ b/tests/test_processing_common.py @@ -17,6 +17,7 @@ import inspect import json import tempfile +from typing import Optional import numpy as np @@ -86,10 +87,25 @@ def get_processor(self): processor = self.processor_class(**components, **self.prepare_processor_dict()) return processor + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer" + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return ["lower newer"] + return ["lower newer", "upper older longer string"] + ["lower newer"] * (batch_size - 2) + @require_vision - def prepare_image_inputs(self): + def prepare_image_inputs(self, batch_size: Optional[int] = None): """This function prepares a list of PIL images for testing""" - return prepare_image_inputs() + if batch_size is None: + return prepare_image_inputs()[0] + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + return prepare_image_inputs() * batch_size @require_vision def prepare_video_inputs(self): @@ -148,7 +164,7 @@ def test_tokenizer_defaults_preserved_by_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = 
"lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") self.assertEqual(inputs[self.text_input_name].shape[-1], 117) @@ -170,7 +186,7 @@ def test_image_processor_defaults_preserved_by_image_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, return_tensors="pt") @@ -184,7 +200,7 @@ def test_kwargs_overrides_default_tokenizer_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length" @@ -203,7 +219,7 @@ def test_kwargs_overrides_default_image_processor_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor(text=input_str, images=image_input, do_rescale=True, rescale_factor=-1, return_tensors="pt") @@ -216,7 +232,7 @@ def test_unstructured_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() inputs = processor( text=input_str, @@ -238,8 +254,8 @@ def test_unstructured_kwargs_batched(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer", "upper older longer string"] - 
image_input = self.prepare_image_inputs() * 2 + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) inputs = processor( text=input_str, images=image_input, @@ -263,7 +279,7 @@ def test_doubly_passed_kwargs(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = ["lower newer"] + input_str = [self.prepare_text_inputs()] image_input = self.prepare_image_inputs() with self.assertRaises(ValueError): _ = processor( @@ -281,7 +297,7 @@ def test_structured_kwargs_nested(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -303,7 +319,7 @@ def test_structured_kwargs_nested_from_dict(self): processor_components = self.prepare_components() processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() # Define the kwargs for each modality @@ -326,7 +342,7 @@ def test_overlapping_text_kwargs_handling(self): processor = self.processor_class(**processor_components) self.skip_processor_without_typed_kwargs(processor) - input_str = "lower newer" + input_str = self.prepare_text_inputs() image_input = self.prepare_image_inputs() with self.assertRaises(ValueError):