MixEval-X Image / Video (#434)
* reformat mix eval

* video2text fix

* video

* Add image-to-text evaluation tasks and templates

* Refactor image-to-text evaluation tasks and update configurations

* Enhance LlamaVision model with video loading improvements and configuration updates

* fix internvl2

* llava vid default numframe too small

* add max_new_tokens

* remove "with a short phrase" for `gpt4v`
pufanyi authored Dec 3, 2024
1 parent dd2839e commit df4f745
Showing 25 changed files with 1,036 additions and 332 deletions.
3 changes: 3 additions & 0 deletions lmms_eval/filters/extraction.py
@@ -1,7 +1,10 @@
import os
import re
import sys
import unicodedata

import openai

from lmms_eval.api.filter import Filter


4 changes: 2 additions & 2 deletions lmms_eval/models/internvl2.py
@@ -139,8 +139,8 @@ def __init__(
super().__init__()

self.path = pretrained
self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True).eval().cuda()
self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True)
self._model = AutoModel.from_pretrained(self.path, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True, trust_remote_code=True, device_map=device_map).eval()
self._tokenizer = AutoTokenizer.from_pretrained(self.path, trust_remote_code=True, device_map=device_map)

batch_size = int(batch_size)
assert batch_size == 1, f"Batch size should be 1 for InternVL2, but got {batch_size}."
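For context, a standalone sketch of the new loading path, assuming a multi-GPU run where `device_map` (e.g. `"auto"`) is forwarded to Accelerate; the checkpoint name is illustrative. Tokenizers carry no weights, so a `device_map` kwarg on `AutoTokenizer.from_pretrained` has no placement effect.

import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative checkpoint name; substitute the InternVL2 variant under test.
path = "OpenGVLab/InternVL2-8B"

# device_map="auto" lets Accelerate shard the bf16 weights across visible
# GPUs, replacing the previous single-device .eval().cuda() placement.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    device_map="auto",
).eval()

# Tokenizers hold no weights, so no device placement is needed here.
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)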
31 changes: 11 additions & 20 deletions lmms_eval/models/llama_vision.py
@@ -15,6 +15,7 @@
from lmms_eval.api.instance import Instance
from lmms_eval.api.model import lmms
from lmms_eval.api.registry import register_model
from lmms_eval.models.model_utils.load_video import read_video_pyav_pil

warnings.filterwarnings("ignore")

@@ -25,33 +26,19 @@

@register_model("llama_vision")
class LlamaVision(lmms):
"""
Llava Model for Hugging Face Transformers: https://huggingface.co/docs/transformers/v4.39.3/en/model_doc/llava
Adapted from the InstructBLIP model in lmms_eval/models/instructblip.py
Example usage:
accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval \
--model llava_hf \
--model_args pretrained=llava-hf/llava-1.5-7b-hf \
--tasks seedbench \
--batch_size 1 \
--output_path ./logs/ \
--log_samples
"""

def __init__(
self,
pretrained: str = "meta-llama/Llama-3.2-11B-Vision",
revision: str = "main",
device: str = "cuda",
dtype: Optional[Union[str, torch.dtype]] = "auto",
batch_size: int = 1,
trust_remote_code: Optional[bool] = False,
trust_remote_code: Optional[bool] = True,
attn_implementation: Optional[str] = None,
device_map: str = "",
max_frames_num: Optional[int] = 32,
fps: Optional[int] = None,
max_image_size: Optional[int] = None,
**kwargs,
) -> None:
super().__init__()
@@ -68,7 +55,9 @@ def __init__(
if isinstance(dtype, str) and dtype != "auto":
dtype = getattr(torch, dtype)

self.fps = fps
self.max_frames_num = max_frames_num
self.max_image_size = max_image_size
self._model = MllamaForConditionalGeneration.from_pretrained(pretrained, revision=revision, torch_dtype=dtype, device_map=self.device_map, trust_remote_code=trust_remote_code, attn_implementation=attn_implementation)
self.model.eval()
self.processor = AutoProcessor.from_pretrained(pretrained)
@@ -193,9 +182,11 @@ def generate_until(self, requests: List[Instance]) -> List[str]:

for visual in visuals:
if isinstance(visual, str):
frames = self.load_video(visual, self.max_frames_num)
frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
images.extend([to_pil_image(frame) for frame in frames])
frames = read_video_pyav_pil(visual, num_frm=self.max_frames_num, fps=self.fps, max_image_size=self.max_image_size)
images.extend(frames)
# frames = self.load_video(visual, self.max_frames_num)
# frames = torch.from_numpy(frames).permute(0, 3, 1, 2)
# images.extend([to_pil_image(frame) for frame in frames])
elif isinstance(visual, PIL.Image.Image):
images.append(visual)

2 changes: 1 addition & 1 deletion lmms_eval/models/llava_vid.py
@@ -90,7 +90,7 @@ def __init__(
conv_template="vicuna_v1",
use_cache=True,
truncate_context=False, # whether to truncate the context in generation, set it False for LLaVA-1.6
max_frames_num: int = 3,
max_frames_num: int = 20,
video_fps: int = 1,
mm_resampler_type: str = "spatial_pool",
mm_spatial_pool_stride: int = 2,
3 changes: 3 additions & 0 deletions lmms_eval/tasks/__init__.py
@@ -417,6 +417,8 @@ def _get_task_and_group(self, task_dir: str):
"yaml_path": yaml_path,
}
elif self._config_is_group(config):
if f.endswith("mix_evals_image2text.yaml"):
print(config)
# This is a group config
tasks_and_groups[config["group"]] = {
"type": "group",
@@ -477,6 +479,7 @@ def _get_task_and_group(self, task_dir: str):
else:
self.logger.debug(f"File {f} in {root} could not be loaded as a task or group")

print(tasks_and_groups["mix_evals_image2text"])
return tasks_and_groups


3 changes: 3 additions & 0 deletions lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml
@@ -0,0 +1,3 @@
group: mix_evals_audio2text
task:
- mix_evals_audio2_text_freeform
13 changes: 13 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/_default_template_yaml
@@ -0,0 +1,13 @@
dataset_path: MixEval/MixEval-X
dataset_kwargs:
  video: true # somewhat confusing: the official release stores image data as file paths, so we load it with the video pipeline
cache_dir: mix_evals_image2text
lmms_eval_specific_kwargs:
default:
post_prompt: ""
pre_prompt: ""
gpt4v:
post_prompt: ""
pre_prompt: ""
metadata:
version: 0
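Because the release stores images as paths (hence the video-style loading above), the `doc_to_visual` hook referenced by these tasks presumably just opens those paths as PIL images. A hypothetical sketch — the `image_path` field name is assumed, not taken from the dataset schema:

from PIL import Image


def mix_evals_image2text_doc_to_visual(doc):
    # Hypothetical: resolve the media path(s) recorded in the doc and return
    # a list of RGB PIL images, the shape lmms_eval visual hooks return.
    paths = doc["image_path"] if isinstance(doc["image_path"], list) else [doc["image_path"]]
    return [Image.open(p).convert("RGB") for p in paths]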
4 changes: 4 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text.yaml
@@ -0,0 +1,4 @@
group: mix_evals_image2text
task:
- mix_evals_image2text_mc
- mix_evals_image2text_freeform
17 changes: 17 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform.yaml
@@ -0,0 +1,17 @@
task: "mix_evals_image2text_freeform"
dataset_name: "image2text"
test_split: free_form
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"
process_results: !function utils.mix_evals_image2text_process_results_freeform
metric_list:
- metric: gpt_eval
aggregation: !function utils.mix_evals_image2text_gpt_eval
higher_is_better: true

generation_kwargs:
max_new_tokens: 1024

include: _default_template_yaml
25 changes: 25 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_freeform_hard.yaml
@@ -0,0 +1,25 @@
task: "mix_evals_image2text_freeform_hard"
dataset_name: "image2text"
test_split: free_form_hard
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"
process_results: !function utils.mix_evals_image2text_process_results_freeform
metric_list:
- metric: gpt_eval
aggregation: !function utils.mix_evals_image2text_gpt_eval
higher_is_better: true

generation_kwargs:
max_new_tokens: 1024

include: _default_template_yaml

lmms_eval_specific_kwargs:
default:
pre_prompt: "Please answer the following questions about the image."
post_prompt: ""
gpt4v:
pre_prompt: "Please answer the following questions about the image."
post_prompt: ""
5 changes: 5 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_hard.yaml
@@ -0,0 +1,5 @@
group: mix_evals_image2text_hard
task:
- mix_evals_image2text_mc_hard
- mix_evals_image2text_freeform_hard
# - mix_evals_image2text_openended
23 changes: 23 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc.yaml
@@ -0,0 +1,23 @@
include: _default_template_yaml
task: "mix_evals_image2text_mc"
dataset_name: "image2text"
test_split: multiple_choice
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"

generation_kwargs:
max_new_tokens: 1024

metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true

filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.GPTMultiChoiceFilter
23 changes: 23 additions & 0 deletions lmms_eval/tasks/mix_evals/image2text/mix_evals_image2text_mc_hard.yaml
@@ -0,0 +1,23 @@
include: _default_template_yaml
task: "mix_evals_image2text_mc_hard"
dataset_name: "image2text"
test_split: multiple_choice_hard
output_type: generate_until
doc_to_visual: !function utils.mix_evals_image2text_doc_to_visual
doc_to_text: !function utils.mix_evals_image2text_doc_to_text
doc_to_target: "{{reference_answer}}"

generation_kwargs:
max_new_tokens: 1024

metric_list:
- metric: exact_match
aggregation: mean
higher_is_better: true
ignore_case: true
ignore_punctuation: true

filter_list:
- name: "flexible-extract"
filter:
- function: !function utils.GPTMultiChoiceFilter
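With these configs in place, the new groups run through the standard CLI; a plausible invocation, adapted from the usage example this commit removed from llama_vision.py (process count and port are illustrative):

accelerate launch --num_processes=8 --main_process_port 12345 -m lmms_eval \
    --model llama_vision \
    --model_args pretrained=meta-llama/Llama-3.2-11B-Vision \
    --tasks mix_evals_image2text \
    --batch_size 1 \
    --output_path ./logs/ \
    --log_samples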