diff --git a/lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml b/lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml new file mode 100644 index 00000000..85b23377 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/audio2text/mix_evals_audio2text.yaml @@ -0,0 +1,3 @@ +group: mix_evals_audio2text +task: +- mix_evals_audio2_text_freeform diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml b/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml deleted file mode 100644 index e49612a8..00000000 --- a/lmms_eval/tasks/mix_evals/mix_evals_video2text.yaml +++ /dev/null @@ -1,5 +0,0 @@ -group: mix_evals_video2text -task: -# - mix_evals_video2text_openconv -- mix_evals_video2text_mc -- mix_evals_video2text_freeform \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/_default_template_yaml b/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml similarity index 89% rename from lmms_eval/tasks/mix_evals/_default_template_yaml rename to lmms_eval/tasks/mix_evals/video2text/_default_template_yaml index bda3f8e8..73473c72 100644 --- a/lmms_eval/tasks/mix_evals/_default_template_yaml +++ b/lmms_eval/tasks/mix_evals/video2text/_default_template_yaml @@ -2,7 +2,7 @@ dataset_kwargs: cache_dir: mix_evals_video2text token: true video: true -dataset_path: lmms-lab/MixEvals_Video2Text +dataset_path: MixEval/MixEval-X lmms_eval_specific_kwargs: default: post_prompt: "" diff --git a/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml new file mode 100644 index 00000000..43fc1133 --- /dev/null +++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text.yaml @@ -0,0 +1,5 @@ +group: mix_evals_video2text +task: +- mix_evals_video2text_mc +- mix_evals_video2text_freeform +- mix_evals_video2text_openended \ No newline at end of file diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml similarity index 100% rename from lmms_eval/tasks/mix_evals/mix_evals_video2text_freeform.yaml rename to lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_freeform.yaml diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml similarity index 100% rename from lmms_eval/tasks/mix_evals/mix_evals_video2text_mc.yaml rename to lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_mc.yaml diff --git a/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_openended.yaml similarity index 88% rename from lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml rename to lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_openended.yaml index a62b2818..7d0baea8 100644 --- a/lmms_eval/tasks/mix_evals/mix_evals_video2text_openended.yaml +++ b/lmms_eval/tasks/mix_evals/video2text/mix_evals_video2text_openended.yaml @@ -1,7 +1,7 @@ include: _default_template_yaml -dataset_name: "video2text_openended" -task: "mix_evals_video2text_openconv" -test_split: test +dataset_name: "open_ended" +task: "mix_evals_video2text_openended" +test_split: video2text output_type: generate_until doc_to_visual: !function utils.mix_evals_video2text_doc_to_visual doc_to_text: !function utils.mix_evals_video2text_doc_to_text_open_convs diff --git a/lmms_eval/tasks/mix_evals/utils.py b/lmms_eval/tasks/mix_evals/video2text/utils.py similarity index 91% rename from lmms_eval/tasks/mix_evals/utils.py rename to lmms_eval/tasks/mix_evals/video2text/utils.py index cd1d8e5b..40e545fd 100644 --- a/lmms_eval/tasks/mix_evals/utils.py +++ b/lmms_eval/tasks/mix_evals/video2text/utils.py @@ -115,18 +115,32 @@ def get_eval(model_response: str, ground_truth: str, max_tokens: int, retries: i cache_dir = os.path.join(cache_dir) -# Pass in video path here -# Can only work correctly with video llm +def mix_evals_doc_to_visual(doc, modality): + visual = [] + for video_path in doc["input_file"]: + video_path = os.path.join(cache_dir, video_path) + if os.path.exists(video_path): + video_path = video_path + elif os.path.exists(video_path.replace("mp4", "MP4")): + video_path = video_path.replace("mp4", "MP4") + else: + sys.exit(f"video path:{video_path} does not exist, please check") + + if modality == "video": + visual.append(video_path) + elif modality == "image": + visual.append(video_path) + else: + sys.exit(f"modality:{modality} is not supported, please check") + return visual + + def mix_evals_video2text_doc_to_visual(doc): - video_path = doc["video_path"] - video_path = os.path.join(cache_dir, video_path) - if os.path.exists(video_path): - video_path = video_path - elif os.path.exists(video_path.replace("mp4", "MP4")): - video_path = video_path.replace("mp4", "MP4") - else: - sys.exit(f"video path:{video_path} does not exist, please check") - return [video_path] + return mix_evals_doc_to_visual(doc, "video") + + +def mix_evals_image2text_doc_to_visual(doc): + return mix_evals_doc_to_visual(doc, "image") # This is the place where you format your question @@ -140,7 +154,7 @@ def mix_evals_video2text_doc_to_text(doc, lmms_eval_specific_kwargs=None): if "post_prompt" in lmms_eval_specific_kwargs: post_prompt = lmms_eval_specific_kwargs["post_prompt"] - user_prompt = doc["prompt"] + user_prompt = doc["query"] if "options" in doc: option_prompt = "Here are the options:\n"