diff --git a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py
index 8387edf3c1f977..bf9dbd951b5b06 100644
--- a/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py
+++ b/src/transformers/models/omdet_turbo/modeling_omdet_turbo.py
@@ -21,7 +21,7 @@
 from dataclasses import dataclass
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional, Tuple, Union
+from typing import List, Optional, Tuple, Union
 
 import torch
 import torch.nn.functional as F
@@ -208,7 +208,10 @@ def load_cuda_kernels():
 
 # Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention
 def multi_scale_deformable_attention(
-    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
+    value: Tensor,
+    value_spatial_shapes: Union[Tensor, List[Tuple]],
+    sampling_locations: Tensor,
+    attention_weights: Tensor,
 ) -> Tensor:
     batch_size, _, num_heads, hidden_dim = value.shape
     _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
diff --git a/tests/models/blip/test_processor_blip.py b/tests/models/blip/test_processor_blip.py
index 4d22c6527c07b1..aa63855da43a24 100644
--- a/tests/models/blip/test_processor_blip.py
+++ b/tests/models/blip/test_processor_blip.py
@@ -152,7 +152,7 @@ def test_unstructured_kwargs_batched(self):
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
+        image_input = self.prepare_image_inputs(batch_size=2)
         inputs = processor(
             text=input_str,
             images=image_input,
diff --git a/tests/models/blip_2/test_processor_blip_2.py b/tests/models/blip_2/test_processor_blip_2.py
index 7151be8ac71200..7eb5bedc2be7a7 100644
--- a/tests/models/blip_2/test_processor_blip_2.py
+++ b/tests/models/blip_2/test_processor_blip_2.py
@@ -17,7 +17,7 @@
 
 import pytest
 
-from transformers.testing_utils import require_torch, require_vision
+from transformers.testing_utils import require_vision
 from transformers.utils import is_vision_available
 
 from ...test_processing_common import ProcessorTesterMixin
@@ -139,30 +139,3 @@ def test_model_input_names(self):
 
         # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask']
         self.assertCountEqual(list(inputs.keys()), ["input_ids", "pixel_values", "attention_mask"])
-
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs_batched(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        if not tokenizer.pad_token:
-            tokenizer.pad_token = "[TEST_PAD]"
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            crop_size={"height": 214, "width": 214},
-            size={"height": 214, "width": 214},
-            padding="longest",
-            max_length=76,
-        )
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 11)
diff --git a/tests/models/bridgetower/test_processing_bridgetower.py b/tests/models/bridgetower/test_processor_bridgetower.py
similarity index 93%
rename from tests/models/bridgetower/test_processing_bridgetower.py
rename to tests/models/bridgetower/test_processor_bridgetower.py
index 19902a1cc57f3b..2ccfde803edb20 100644
--- a/tests/models/bridgetower/test_processing_bridgetower.py
+++ b/tests/models/bridgetower/test_processor_bridgetower.py
@@ -15,8 +15,6 @@
 import tempfile
 import unittest
 
-import numpy as np
-
 from transformers.testing_utils import require_torch, require_vision
 from transformers.utils import is_vision_available
 
@@ -24,8 +22,6 @@
 
 
 if is_vision_available():
-    from PIL import Image
-
     from transformers import (
         AutoProcessor,
         BridgeTowerImageProcessor,
@@ -35,7 +31,7 @@
 
 
 @require_vision
-class Blip2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
+class BridgeTowerProcessorTest(ProcessorTesterMixin, unittest.TestCase):
     processor_class = BridgeTowerProcessor
 
     def setUp(self):
@@ -57,17 +53,6 @@ def get_image_processor(self, **kwargs):
 
     def tearDown(self):
         shutil.rmtree(self.tmpdirname)
 
-    def prepare_image_inputs(self):
-        """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True,
-        or a list of PyTorch tensors if one specifies torchify=True.
-        """
-
-        image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)]
-
-        image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs]
-
-        return image_inputs
-
     # Some kwargs tests are overriden from common tests to handle shortest_edge
     # and size_divisor behaviour
@@ -149,7 +134,7 @@ def test_unstructured_kwargs_batched(self):
         self.skip_processor_without_typed_kwargs(processor)
 
         input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
+        image_input = self.prepare_image_inputs(batch_size=2)
         inputs = processor(
             text=input_str,
             images=image_input,
diff --git a/tests/models/donut/test_processing_donut.py b/tests/models/donut/test_processor_donut.py
similarity index 64%
rename from tests/models/donut/test_processing_donut.py
rename to tests/models/donut/test_processor_donut.py
index 87cdb41a02c7bb..cf720e17b0d9d5 100644
--- a/tests/models/donut/test_processing_donut.py
+++ b/tests/models/donut/test_processor_donut.py
@@ -18,10 +18,6 @@
 import unittest
 
 from transformers import DonutImageProcessor, DonutProcessor, XLMRobertaTokenizerFast
-from transformers.testing_utils import (
-    require_torch,
-    require_vision,
-)
 
 from ...test_processing_common import ProcessorTesterMixin
 
@@ -65,30 +61,3 @@ def test_token2json(self):
         actual_json = self.processor.token2json(sequence)
 
         self.assertDictEqual(actual_json, expected_json)
-
-    @require_torch
-    @require_vision
-    def test_unstructured_kwargs_batched(self):
-        if "image_processor" not in self.processor_class.attributes:
-            self.skipTest(f"image_processor attribute not present in {self.processor_class}")
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-        if not tokenizer.pad_token:
-            tokenizer.pad_token = "[TEST_PAD]"
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        self.skip_processor_without_typed_kwargs(processor)
-
-        input_str = ["lower newer", "upper older longer string"]
-        image_input = self.prepare_image_inputs() * 2
-        inputs = processor(
-            text=input_str,
-            images=image_input,
-            return_tensors="pt",
-            crop_size={"height": 214, "width": 214},
-            size={"height": 214, "width": 214},
-            padding="longest",
-            max_length=76,
-        )
-        self.assertEqual(inputs["pixel_values"].shape[2], 214)
-
-        self.assertEqual(len(inputs["input_ids"][0]), 7)
diff --git a/tests/models/fuyu/test_processing_fuyu.py b/tests/models/fuyu/test_processor_fuyu.py
similarity index 100%
rename from tests/models/fuyu/test_processing_fuyu.py
rename to tests/models/fuyu/test_processor_fuyu.py
diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py
similarity index 100%
rename from tests/models/idefics2/test_processing_idefics2.py
rename to tests/models/idefics2/test_processor_idefics2.py
diff --git a/tests/models/idefics3/test_processing_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
similarity index 100%
rename from tests/models/idefics3/test_processing_idefics3.py
rename to tests/models/idefics3/test_processor_idefics3.py
diff --git a/tests/models/llava_onevision/test_processing_llava_onevision.py b/tests/models/llava_onevision/test_processor_llava_onevision.py
similarity index 100%
rename from tests/models/llava_onevision/test_processing_llava_onevision.py
rename to tests/models/llava_onevision/test_processor_llava_onevision.py
diff --git a/tests/models/musicgen/test_processing_musicgen.py b/tests/models/musicgen/test_processor_musicgen.py
similarity index 100%
rename from tests/models/musicgen/test_processing_musicgen.py
rename to tests/models/musicgen/test_processor_musicgen.py
diff --git a/tests/models/musicgen_melody/test_processor_musicgen_melody.py b/tests/models/musicgen_melody/test_processor_musicgen_melody.py
index e00f31c495990f..04fb94c64c3da8 100644
--- a/tests/models/musicgen_melody/test_processor_musicgen_melody.py
+++ b/tests/models/musicgen_melody/test_processor_musicgen_melody.py
@@ -50,7 +50,7 @@ def floats_list(shape, scale=1.0, rng=None, name=None):
 @require_torch
 @require_sentencepiece
 @require_torchaudio
-# Copied from tests.models.musicgen.test_processing_musicgen.MusicgenProcessorTest with Musicgen->MusicgenMelody, Encodec->MusicgenMelody, padding_mask->attention_mask, input_values->input_features
+# Copied from tests.models.musicgen.test_processor_musicgen.MusicgenProcessorTest with Musicgen->MusicgenMelody, Encodec->MusicgenMelody, padding_mask->attention_mask, input_values->input_features
 class MusicgenMelodyProcessorTest(unittest.TestCase):
     def setUp(self):
         # Ignore copy
diff --git a/tests/models/paligemma/test_processing_paligemma.py b/tests/models/paligemma/test_processing_paligemma.py
deleted file mode 100644
index 33b31507e17df2..00000000000000
--- a/tests/models/paligemma/test_processing_paligemma.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import shutil
-import tempfile
-import unittest
-
-from transformers import AutoProcessor, GemmaTokenizerFast, PaliGemmaProcessor
-from transformers.testing_utils import require_read_token, require_vision
-from transformers.utils import is_vision_available
-
-from ...test_processing_common import ProcessorTesterMixin
-
-
-if is_vision_available():
-    from transformers import SiglipImageProcessor
-
-
-@require_vision
-@require_read_token
-class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
-    processor_class = PaliGemmaProcessor
-
-    def setUp(self):
-        self.tmpdirname = tempfile.mkdtemp()
-        image_processor = SiglipImageProcessor(do_center_crop=False)
-        tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-7b")
-        image_processor.image_seq_length = 32
-
-        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
-        processor.save_pretrained(self.tmpdirname)
-
-    def get_tokenizer(self, **kwargs):
-        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
-
-    def get_image_processor(self, **kwargs):
-        return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
-
-    def test_text_with_image_tokens(self):
-        image_processor = self.get_component("image_processor")
-        tokenizer = self.get_component("tokenizer")
-
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        text_multi_images = "<image><image>Dummy text!"
-        text_single_image = "<image>Dummy text!"
-        text_no_image = "Dummy text!"
-
-        image = self.prepare_image_inputs()[0]
-
-        out_noimage = processor(text=text_no_image, images=image, return_tensors="np")
-        out_singlimage = processor(text=text_single_image, images=image, return_tensors="np")
-        for k in out_noimage:
-            self.assertTrue(out_noimage[k].tolist() == out_singlimage[k].tolist())
-
-        out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np")
-        out_noimage = processor(text=text_no_image, images=[[image, image]], return_tensors="np")
-
-        # We can't be sure what is users intention, whether user want "one text + two images" or user forgot to add the second text
-        with self.assertRaises(ValueError):
-            out_noimage = processor(text=text_no_image, images=[image, image], return_tensors="np")
-
-        for k in out_noimage:
-            self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist())
-
-        text_batched = ["Dummy text!", "Dummy text!"]
-        text_batched_with_image = ["<image>Dummy text!", "<image>Dummy text!"]
-        out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np")
-        out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np")
-        out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np")
-        for k in out_noimage:
-            self.assertTrue(out_noimage[k].tolist() == out_images[k].tolist() == out_noimage_nested[k].tolist())
diff --git a/tests/models/paligemma/test_processor_paligemma.py b/tests/models/paligemma/test_processor_paligemma.py
index 60de913e53ae9b..245aff594125cf 100644
--- a/tests/models/paligemma/test_processor_paligemma.py
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -16,7 +16,7 @@
 import tempfile
 import unittest
 
-from transformers import GemmaTokenizer
+from transformers import GemmaTokenizer, PaliGemmaProcessor
 from transformers.testing_utils import get_tests_dir, require_torch, require_vision
 from transformers.utils import is_vision_available
 
@@ -24,11 +24,7 @@
 
 
 if is_vision_available():
-    from transformers import (
-        PaliGemmaProcessor,
-        SiglipImageProcessor,
-        is_vision_available,
-    )
+    from transformers import SiglipImageProcessor
 
 
 SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
@@ -61,3 +57,37 @@ def test_image_seq_length(self):
             text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
         )
         self.assertEqual(len(inputs["input_ids"][0]), 112 + 14)
+
+    def test_text_with_image_tokens(self):
+        image_processor = self.get_component("image_processor")
+        tokenizer = self.get_component("tokenizer")
+
+        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        text_multi_images = "<image><image>Dummy text!"
+        text_single_image = "<image>Dummy text!"
+        text_no_image = "Dummy text!"
+
+        image = self.prepare_image_inputs()
+
+        out_noimage = processor(text=text_no_image, images=image, return_tensors="np")
+        out_singlimage = processor(text=text_single_image, images=image, return_tensors="np")
+        for k in out_noimage:
+            self.assertTrue(out_noimage[k].tolist() == out_singlimage[k].tolist())
+
+        out_multiimages = processor(text=text_multi_images, images=[image, image], return_tensors="np")
+        out_noimage = processor(text=text_no_image, images=[[image, image]], return_tensors="np")
+
+        # We can't be sure what is users intention, whether user want "one text + two images" or user forgot to add the second text
+        with self.assertRaises(ValueError):
+            out_noimage = processor(text=text_no_image, images=[image, image], return_tensors="np")
+
+        for k in out_noimage:
+            self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist())
+
+        text_batched = ["Dummy text!", "Dummy text!"]
+        text_batched_with_image = ["<image>Dummy text!", "<image>Dummy text!"]
+        out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np")
+        out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np")
+        out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np")
+        for k in out_noimage:
+            self.assertTrue(out_noimage[k].tolist() == out_images[k].tolist() == out_noimage_nested[k].tolist())
diff --git a/tests/models/qwen2_vl/test_processing_qwen2_vl.py b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
similarity index 100%
rename from tests/models/qwen2_vl/test_processing_qwen2_vl.py
rename to tests/models/qwen2_vl/test_processor_qwen2_vl.py
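
For reviewers who want to sanity-check the widened `value_spatial_shapes` annotation from the first hunk, here is a minimal smoke-test sketch, not part of the PR: the import path is the file touched by the diff (assumed importable as a module-level function), and all shape values are arbitrary.

```python
import torch

# Import path taken from the file touched in the first hunk (assumption:
# the pure-PyTorch fallback is importable at module level, as the diff suggests).
from transformers.models.omdet_turbo.modeling_omdet_turbo import (
    multi_scale_deformable_attention,
)

batch_size, num_heads, hidden_dim = 2, 8, 32
num_queries, num_points = 10, 4

# Two feature levels passed as a plain Python list of (height, width)
# tuples -- the form newly permitted by Union[Tensor, List[Tuple]].
spatial_shapes = [(16, 16), (8, 8)]
num_levels = len(spatial_shapes)
num_values = sum(h * w for h, w in spatial_shapes)  # flattened tokens across levels

value = torch.rand(batch_size, num_values, num_heads, hidden_dim)
sampling_locations = torch.rand(batch_size, num_queries, num_heads, num_levels, num_points, 2)
# Attention weights normalized over (levels x points), as the attention module would produce.
attention_weights = torch.softmax(
    torch.rand(batch_size, num_queries, num_heads, num_levels * num_points), dim=-1
).view(batch_size, num_queries, num_heads, num_levels, num_points)

output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
print(output.shape)  # torch.Size([2, 10, 256]) -> (batch_size, num_queries, num_heads * hidden_dim)
```

Passing `torch.tensor(spatial_shapes)` instead should exercise the original tensor path, which is presumably why the annotation is widened to a `Union` rather than replaced outright.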