diff --git a/docs/Evaluation.md b/docs/Evaluation.md
index c56e4cb6c..af010ec1f 100644
--- a/docs/Evaluation.md
+++ b/docs/Evaluation.md
@@ -183,6 +183,28 @@ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
 ```
 3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook.
+
+### HallusionBench
+
+1. Download the zipped images folder [`hallusion_bench.zip`](https://drive.google.com/file/d/1eeO1i0G9BSZTE1yd5XeFwmrbe1hwyf_0/view?usp=sharing) and the JSON file [`HallusionBench.json`](https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/HallusionBench.json) to `./playground/data/eval/hallusion-bench` with:
+   ```Shell
+   gdown https://drive.google.com/uc?id=1eeO1i0G9BSZTE1yd5XeFwmrbe1hwyf_0
+   wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/HallusionBench.json
+   ```
+2. Unzip `hallusion_bench.zip` into `./playground/data/eval/hallusion-bench/images` with:
+   ```Shell
+   unzip hallusion_bench.zip -d ./playground/data/eval/hallusion-bench/images
+   ```
+3. Download the `evaluation.py` and `utils.py` scripts from the [HallusionBench GitHub](https://github.com/tianyi-lab/HallusionBench/tree/main) to `./playground/data/eval/hallusion-bench` using:
+   ```Shell
+   wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/evaluation.py
+   wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/utils.py
+   ```
+4. Run multi-GPU inference and evaluation:
+   ```Shell
+   CUDA_VISIBLE_DEVICES=0,1 bash scripts/maya/eval/hallusionbench.sh
+   ```
+
 
 ## More Benchmarks
 
 Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release.
diff --git a/llava/eval/model_vqa_hallusionbench.py b/llava/eval/model_vqa_hallusionbench.py
new file mode 100644
index 000000000..cdda77423
--- /dev/null
+++ b/llava/eval/model_vqa_hallusionbench.py
@@ -0,0 +1,134 @@
+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from llava.conversation import conv_templates, SeparatorStyle
+from llava.model.builder import load_pretrained_model
+from llava.utils import disable_torch_init
+from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+from llava.eval.maya.eval_utils import load_maya_model
+
+from PIL import Image
+import math
+
+# TODO: fix answer generation, as all results are 2
+
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+
+
+def eval_model(args):
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    if 'maya' not in model_name:
+        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    else:
+        model, tokenizer, image_processor, context_len = load_maya_model(args.model_base, model_path, mode=args.mode)
+
+    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
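+    # One answers file is written per chunk (e.g. answers/<ckpt>/<num_chunks>_<chunk_idx>.jsonl
+    # when launched via scripts/maya/eval/hallusionbench.sh); the launcher merges the
+    # per-chunk files into merge.jsonl after all chunks finish.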
+    ans_file = open(answers_file, "w")
+
+    for line in tqdm(questions):
+        idx = line["question_id"]
+        image_file = line.get("image")  # May be None for text-only questions
+        qs = line["text"]
+
+        # Always create a conversation, but handle the image token differently
+        conv = conv_templates[args.conv_mode].copy()
+
+        if image_file is None:
+            # Text-only question - don't add image token
+            conv.append_message(conv.roles[0], qs)
+            image_tensor = None
+        else:
+            # Image question - add image token
+            if model.config.mm_use_im_start_end:
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+            conv.append_message(conv.roles[0], qs)
+
+            # Process image
+            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
+            image_tensor = process_images([image], image_processor, model.config)[0].unsqueeze(0).half().cuda()
+
+        # Add response placeholder and get prompt
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        # Tokenize input
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+
+        # Generate response
+        with torch.inference_mode():
+            outputs = model.generate(
+                inputs=input_ids,
+                images=image_tensor if image_file is not None else None,
+                do_sample=True if args.temperature > 0 else False,
+                temperature=args.temperature,
+                max_new_tokens=30,
+                use_cache=True,
+            )
+
+        output_text = tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
+
+        # Convert output to HallusionBench format (0=No, 1=Yes, 2=Uncertain)
+        output_text = output_text.lower().strip()
+        if "yes" in output_text:
+            answer = "1"
+        elif "no" in output_text:
+            answer = "0"
+        else:
+            answer = "2"
+
+        # Save result
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({
+            "question_id": idx,
+            "prompt": qs,
+            "text": answer,
+            "answer_id": ans_id,
+            "model_id": model_name,
+            "metadata": {
+                "raw_response": output_text,
+                "category": line.get("category"),
+                "subcategory": line.get("subcategory"),
+                "set_id": line.get("set_id"),
+                "figure_id": line.get("figure_id"),
+            }
+        }) + "\n")
+        ans_file.flush()
+
+    ans_file.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="nahidalam/maya_full_ft")
+    parser.add_argument("--model-base", type=str, default="CohereForAI/aya-23-8B")
+    parser.add_argument("--mode", type=str, default="finetuned")
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="llava_v1")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    args = parser.parse_args()
+
+    eval_model(args)
diff --git a/scripts/convert_hallusionbench_for_eval.py b/scripts/convert_hallusionbench_for_eval.py
new file mode 100644
index 000000000..5fe2800aa
--- /dev/null
+++ b/scripts/convert_hallusionbench_for_eval.py
@@ -0,0 +1,48 @@
+import json
+import os
+from tqdm import tqdm
+
+def convert_for_eval(src_file, dst_file):
+    """Convert HallusionBench format to LLaVA format."""
+    with open(src_file, 'r') as f:
+        data = json.load(f)
+
+    converted = []
+    for item in tqdm(data):
+        # Get image path if visual input is required
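+        # visual_input == 0 marks a text-only question (image left as None);
+        # otherwise the figure is referenced by the relative path
+        # <category>/<subcategory>/<set_id>_<figure_id>.png inside the images folder.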
+        image_file = None
+        if int(item['visual_input']) != 0:
+            image_file = os.path.join(
+                item['category'],
+                item['subcategory'],
+                f"{item['set_id']}_{item['figure_id']}.png"
+            )
+
+        # Convert to LLaVA format
+        converted_item = {
+            "question_id": f"{item['category']}_{item['subcategory']}_{item['set_id']}_{item['figure_id']}_{item['question_id']}",
+            "image": image_file,
+            "text": item['question'],
+            "category": item['category'],
+            "subcategory": item['subcategory'],
+            "set_id": item['set_id'],
+            "figure_id": item['figure_id'],
+            "visual_input": item['visual_input'],
+            "gt_answer": item['gt_answer'],
+            "gt_answer_details": item['gt_answer_details']
+        }
+        converted.append(converted_item)
+
+    # Write to JSONL format
+    with open(dst_file, 'w') as f:
+        for item in converted:
+            f.write(json.dumps(item) + '\n')
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--src", type=str, default="HallusionBench.json")
+    parser.add_argument("--dst", type=str, default="llava-hallusionbench.jsonl")
+    args = parser.parse_args()
+
+    convert_for_eval(args.src, args.dst)
diff --git a/scripts/convert_hallusionbench_result_for_eval.py b/scripts/convert_hallusionbench_result_for_eval.py
new file mode 100644
index 000000000..f8d090719
--- /dev/null
+++ b/scripts/convert_hallusionbench_result_for_eval.py
@@ -0,0 +1,42 @@
+import json
+import argparse
+from tqdm import tqdm
+
+def convert_result(data_file, src_file, dst_file):
+    """Convert LLaVA output format back to HallusionBench format."""
+    # Read original HallusionBench data
+    with open(data_file, 'r') as f:
+        orig_data = json.load(f)
+
+    # Create lookup dictionary
+    results = {}
+    with open(src_file, 'r') as f:
+        for line in f:
+            item = json.loads(line)
+            results[item['question_id']] = item['text']
+
+    # Add model predictions to original data
+    for item in tqdm(orig_data):
+        qid = f"{item['category']}_{item['subcategory']}_{item['set_id']}_{item['figure_id']}_{item['question_id']}"
+        response = results.get(qid, "")
+
+        # The answers file already encodes responses as "0", "1", or "2"
+        # (see llava/eval/model_vqa_hallusionbench.py), so pass those through;
+        # fall back to yes/no matching only for raw free-form text.
+        response = response.lower().strip()
+        if response in ("0", "1", "2"):
+            item['model_prediction'] = response
+        elif "yes" in response:
+            item['model_prediction'] = "1"
+        elif "no" in response:
+            item['model_prediction'] = "0"
+        else:
+            item['model_prediction'] = "2"
+
+    # Save results
+    with open(dst_file, 'w') as f:
+        json.dump(orig_data, f, indent=2)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data", type=str, help="Original HallusionBench data")
+    parser.add_argument("--src", type=str, help="Source file with model outputs")
+    parser.add_argument("--dst", type=str, help="Destination file for evaluation")
+    args = parser.parse_args()
+
+    convert_result(args.data, args.src, args.dst)
diff --git a/scripts/maya/eval/hallusionbench.sh b/scripts/maya/eval/hallusionbench.sh
new file mode 100644
index 000000000..5edf6cfc0
--- /dev/null
+++ b/scripts/maya/eval/hallusionbench.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+pip install prettytable openai==0.28
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+CHUNKS=${#GPULIST[@]}
+
+CKPT="maya_toxicity_free_finetuned"
+# CKPT="maya_full_ft"
+SPLIT="llava-hallusionbench"
+EVAL_DIR="./playground/data/eval/hallusion-bench"
+
+mkdir -p ${EVAL_DIR}/answers/${CKPT}
+
+# Convert HallusionBench format to LLaVA format
+python scripts/convert_hallusionbench_for_eval.py \
+    --src ${EVAL_DIR}/HallusionBench.json \
+    --dst ${EVAL_DIR}/${SPLIT}.jsonl
+
+# Run model on chunks
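+# Each chunk runs on its own GPU from CUDA_VISIBLE_DEVICES as a background job;
+# `wait` below blocks until every chunk has written its answers file.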
+for IDX in $(seq 0 $((CHUNKS-1))); do
+    CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_hallusionbench \
+        --model-path nahidalam/${CKPT} \
+        --model-base CohereForAI/aya-23-8B \
+        --question-file ${EVAL_DIR}/${SPLIT}.jsonl \
+        --image-folder ${EVAL_DIR}/images \
+        --answers-file ${EVAL_DIR}/answers/${CKPT}/${CHUNKS}_${IDX}.jsonl \
+        --num-chunks $CHUNKS \
+        --chunk-idx $IDX \
+        --temperature 0 \
+        --conv-mode aya &
+done
+
+wait
+
+# Merge results
+output_file=${EVAL_DIR}/answers/${CKPT}/merge.jsonl
+> "$output_file"
+for IDX in $(seq 0 $((CHUNKS-1))); do
+    cat ${EVAL_DIR}/answers/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file"
+done
+
+mkdir -p ${EVAL_DIR}/results/${CKPT}
+
+# Convert results back to HallusionBench format
+python scripts/convert_hallusionbench_result_for_eval.py \
+    --data ${EVAL_DIR}/HallusionBench.json \
+    --src ${EVAL_DIR}/answers/${CKPT}/merge.jsonl \
+    --dst ${EVAL_DIR}/results/${CKPT}/HallusionBench_result.json
+
+# Run evaluation
+cd ${EVAL_DIR}
+python evaluation.py --model ${CKPT}
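+
+# Note: evaluation.py and utils.py come from the HallusionBench repo; they may call
+# the OpenAI API to judge answers (hence the openai==0.28 install above), in which
+# case OPENAI_API_KEY must be set in the environment before this final step.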