mlfoundations · olo126 · May 2, 2024 · May 9, 2024 · May 9, 2024 · May 25, 2024
diff --git a/open_flamingo/eval/eval_datasets.py b/open_flamingo/eval/eval_datasets.py
@@ -14,6 +14,8 @@
     "okvqa",
     "vizwiz",
     "textvqa",
+    "gqa",
+    "mantiseval",
     "hateful_memes",
     "imagenet",
 ]
@@ -104,16 +106,30 @@ def get_img_path(self, question):
             )
         elif self.dataset_name == "vizwiz":
             return os.path.join(self.image_dir_path, question["image_id"])
-        elif self.dataset_name == "textvqa":
+        elif self.dataset_name in ("textvqa", "gqa"):
             return os.path.join(self.image_dir_path, f"{question['image_id']}.jpg")
+        elif self.dataset_name == "mantiseval":
+            # question['image_id'] is iterated as a collection of ids: one path per id.
+            return [
+                os.path.join(self.image_dir_path, f"{img_id}.jpg")
+                for img_id in question['image_id']
+            ]
         else:
             raise Exception(f"Unknown VQA dataset {self.dataset_name}")
 
     def __getitem__(self, idx):
         question = self.questions[idx]
         img_path = self.get_img_path(question)
-        image = Image.open(img_path)
-        image.load()
+        if self.dataset_name == "mantiseval":
+            # img_path is the list built by get_img_path; open and load each file in order.
+            image = []
+            for single_path in img_path:
+                opened = Image.open(single_path)
+                opened.load()
+                image.append(opened)
+        else:
+            image = Image.open(img_path)
+            image.load()
         results = {
             "image": image,
             "question": question["question"],

diff --git a/open_flamingo/eval/eval_models/blip.py b/open_flamingo/eval/eval_models/blip.py
@@ -5,7 +5,7 @@
 
 from transformers import Blip2Processor, Blip2ForConditionalGeneration
 from eval_models.eval_model import BaseEvalModel
-from utils import unwrap_model
+from utils import unwrap_model, combine_images
 from transformers.modeling_outputs import CausalLMOutputWithPast
 
 
@@ -27,9 +27,14 @@ def required_args(self):
 
     def prepare_images(self, batch: List[List[Image.Image]]) -> torch.Tensor:
         batch_images = None
-        assert all(
-            len(example) == 1 for example in batch
-        ), "BLIP-2 only supports one image per example"
+        # Examples with more than one image are passed through combine_images
+        # before preprocessing; this replaces the former assertion that every
+        # example holds exactly one image. A new list is built so the caller's
+        # batch is not modified in place.
+        batch = [
+            combine_images(example) if len(example) > 1 else example
+            for example in batch
+        ]
         for example in batch:
             if batch_images is None:
                 batch_images = self.processor.image_processor(
@@ -108,6 +113,16 @@ def get_vizwiz_prompt(self, question, answer=None) -> str:
 
     def get_textvqa_prompt(self, question, answer=None) -> str:
         return f"Question:{question} Short answer:{answer if answer is not None else ''}"
+
+    def get_gqa_prompt(self, question, answer=None) -> str:
+        """Build the GQA prompt; the answer slot stays empty when `answer` is None."""
+        shown_answer = "" if answer is None else answer
+        return f"Question:{question} Short answer:{shown_answer}"
+
+    def get_mantiseval_prompt(self, question, answer=None) -> str:
+        """Build the MantisEval prompt; the answer slot stays empty when `answer` is None."""
+        shown_answer = "" if answer is None else answer
+        return f"Question:{question} Short answer:{shown_answer}"
 
     def get_coco_prompt(self, caption=None) -> str:
         return f"A photo of {caption if caption is not None else ''}"

diff --git a/open_flamingo/eval/eval_models/open_flamingo.py b/open_flamingo/eval/eval_models/open_flamingo.py
@@ -287,6 +287,18 @@ def get_vizwiz_prompt(self, question, answer=None) -> str:
 
     def get_textvqa_prompt(self, question, answer=None) -> str:
         return f"<image>Question:{question} Short answer:{answer if answer is not None else ''}{'<|endofchunk|>' if answer is not None else ''}"
+
+    def get_gqa_prompt(self, question, answer=None) -> str:
+        """Build the GQA prompt; the answer and end-of-chunk token appear only when `answer` is given."""
+        if answer is None:
+            return f"<image>Question:{question} Short answer:"
+        return f"<image>Question:{question} Short answer:{answer}<|endofchunk|>"
+
+    def get_mantiseval_prompt(self, question, answer=None) -> str:
+        """Build the MantisEval prompt; the answer and end-of-chunk token appear only when `answer` is given."""
+        if answer is None:
+            return f"<image>Question:{question} Short answer:"
+        return f"<image>Question:{question} Short answer:{answer}<|endofchunk|>"
 
     def get_coco_prompt(self, caption=None) -> str:
         return f"<image>Output:{caption if caption is not None else ''}{'<|endofchunk|>' if caption is not None else ''}"