diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py
index 52d2f1539a4867..d315ce612fda8f 100644
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -505,3 +505,70 @@ def test_unstructured_kwargs(self):
 
         self.assertEqual(inputs["pixel_values"].shape[3], 32)
         self.assertEqual(len(inputs["input_ids"][0]), 120)
+
+    @require_torch
+    @require_vision
+    def test_text_only_inference(self):
+        """Test that the processor works correctly with text-only input.
+
+        Checks a single string and a padded batch against ids rebuilt from the tokenizer,
+        and that no pixel entries are present in the single-string output.
+        """
+        processor = self.get_processor()
+
+        text = "This is a simple text without images."
+        inputs = processor(text=text)
+
+        # Expected ids are rebuilt as BOS + the tokens encoded without special tokens.
+        tokenized_sentence = processor.tokenizer(text, add_special_tokens=False)
+        expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"]]
+
+        self.assertEqual(inputs["input_ids"], expected_input_ids)
+        self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+        self.assertNotIn("pixel_values", inputs)
+        self.assertNotIn("pixel_attention_mask", inputs)
+
+        # Test batch of texts without image tokens
+        texts = ["First text.", "Second piece of text."]
+        batch_inputs = processor(text=texts, padding=True)
+
+        expected_ids = [
+            [self.bos_token_id] + processor.tokenizer(sample, add_special_tokens=False)["input_ids"]
+            for sample in texts
+        ]
+
+        # Left-pad every sequence to the longest one; padded positions get attention 0.
+        max_len = max(len(ids) for ids in expected_ids)
+        padded_ids = [[self.padding_token_id] * (max_len - len(ids)) + ids for ids in expected_ids]
+        attention_masks = [[0] * (max_len - len(ids)) + [1] * len(ids) for ids in expected_ids]
+
+        self.assertEqual(batch_inputs["input_ids"], padded_ids)
+        self.assertEqual(batch_inputs["attention_mask"], attention_masks)
+
+    @require_torch
+    @require_vision
+    def test_missing_images_error(self):
+        """Test that appropriate error is raised when images are referenced but not provided."""
+        processor = self.get_processor()
+
+        # NOTE(review): "<image>" is assumed to be the processor's image placeholder token;
+        # confirm against the processor returned by get_processor().
+        text = "Let me show you this image: <image> What do you think?"
+        texts = [
+            "First text with <image> token.",
+            "Second text with <image> token.",
+        ]
+
+        # Every call references an image in the text but supplies none: images omitted,
+        # an empty list, or one empty list per batch entry.
+        cases = [
+            {"text": text},
+            {"text": texts},
+            {"text": text, "images": []},
+            {"text": texts, "images": [[], []]},
+        ]
+        for kwargs in cases:
+            with self.subTest(**kwargs):
+                with self.assertRaises(ValueError) as context:
+                    processor(**kwargs)
+                self.assertIn("Number of images", str(context.exception))