[WIP] Evals for HallusionBench #31

Open · wants to merge 2 commits into `maya_eval`
22 changes: 22 additions & 0 deletions docs/Evaluation.md
@@ -183,6 +183,28 @@ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
```
3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook.


### HallusionBench

1. Download the zipped images [`hallusion_bench.zip`](https://drive.google.com/file/d/1eeO1i0G9BSZTE1yd5XeFwmrbe1hwyf_0/view?usp=sharing) and the annotation file [`HallusionBench.json`](https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/HallusionBench.json) into `./playground/data/eval/hallusion-bench` (run the commands from that directory):
```Shell
gdown https://drive.google.com/uc?id=1eeO1i0G9BSZTE1yd5XeFwmrbe1hwyf_0
wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/HallusionBench.json
```
2. Unzip `hallusion_bench.zip` into `./playground/data/eval/hallusion-bench/images` with:
```Shell
unzip hallusion_bench.zip -d ./playground/data/eval/hallusion-bench/images
```
3. Download the `evaluation.py` and `utils.py` scripts from the [HallusionBench GitHub](https://github.com/tianyi-lab/HallusionBench/tree/main) to `./playground/data/eval/hallusion-bench` using:
```Shell
wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/evaluation.py
wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/utils.py
```
4. Multi-GPU inference:
```Shell
CUDA_VISIBLE_DEVICES=0,1 bash scripts/maya/eval/hallusionbench.sh
```
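5. The script converts `HallusionBench.json` into LLaVA-style JSONL, shards inference across the listed GPUs, merges the per-chunk answers into `answers/<ckpt>/merge.jsonl`, writes `results/<ckpt>/HallusionBench_result.json`, and finally runs the official `evaluation.py`. For a quick sanity check of the predictions independent of that step, a minimal sketch is shown below; it assumes the result path produced by `scripts/maya/eval/hallusionbench.sh` and that `gt_answer` is stored as `"0"`/`"1"` in the released `HallusionBench.json`:
```python
import json

# Raw per-question agreement with gt_answer; NOT the official qAcc/fAcc/aAcc metrics,
# which are computed by the upstream evaluation.py.
CKPT = "maya_toxicity_free_finetuned"  # must match CKPT in scripts/maya/eval/hallusionbench.sh
result_file = f"./playground/data/eval/hallusion-bench/results/{CKPT}/HallusionBench_result.json"

with open(result_file) as f:
    data = json.load(f)

decided = [d for d in data if d["model_prediction"] in ("0", "1")]
correct = sum(d["model_prediction"] == str(d["gt_answer"]) for d in decided)
print(f"uncertain answers: {len(data) - len(decided)}/{len(data)}")
print(f"raw accuracy on decided answers: {correct / max(len(decided), 1):.3f}")
```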

## More Benchmarks

Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release.
134 changes: 134 additions & 0 deletions llava/eval/model_vqa_hallusionbench.py
@@ -0,0 +1,134 @@
import argparse
import torch
import os
import json
from tqdm import tqdm
import shortuuid

from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
from llava.conversation import conv_templates, SeparatorStyle
from llava.model.builder import load_pretrained_model
from llava.utils import disable_torch_init
from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
from llava.eval.maya.eval_utils import load_maya_model

from PIL import Image
import math

# TODO: fix answer generation, as all results are 2
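# NOTE: answers are written below already mapped to "0"/"1"/"2"; downstream converters
# (e.g. scripts/convert_hallusionbench_result_for_eval.py) should pass these values
# through rather than re-parse "yes"/"no".
# Possible cause still to verify: slicing the generated ids by input_ids.shape[1]
# before decoding can drop the response if generate() returns only new tokens,
# leaving output_text empty so every answer falls through to "2".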

def split_list(lst, n):
    """Split a list into n (roughly) equal-sized chunks"""
    chunk_size = math.ceil(len(lst) / n)
    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]


def get_chunk(lst, n, k):
    chunks = split_list(lst, n)
    return chunks[k]


def eval_model(args):
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    if 'maya' not in model_name:
        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
    else:
        model, tokenizer, image_processor, context_len = load_maya_model(args.model_base, model_path, mode=args.mode)

    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
    ans_file = open(answers_file, "w")

    for line in tqdm(questions):
        idx = line["question_id"]
        image_file = line.get("image")  # May be None for text-only questions
        qs = line["text"]

        # Always create a conversation, but handle image token differently
        conv = conv_templates[args.conv_mode].copy()

        if image_file is None:
            # Text-only question - don't add image token
            conv.append_message(conv.roles[0], qs)
            image_tensor = None
        else:
            # Image question - add image token
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
            conv.append_message(conv.roles[0], qs)

            # Process image
            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
            image_tensor = process_images([image], image_processor, model.config)[0].unsqueeze(0).half().cuda()

        # Add response placeholder and get prompt
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        # Tokenize input
        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

        # Generate response
        with torch.inference_mode():
            outputs = model.generate(
                inputs=input_ids,
                images=image_tensor if image_file is not None else None,
                do_sample=True if args.temperature > 0 else False,
                temperature=args.temperature,
                max_new_tokens=30,
                use_cache=True,
            )

        output_text = tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)[0]

        # Convert output to HallusionBench format (0=No, 1=Yes, 2=Uncertain)
        output_text = output_text.lower().strip()
        if "yes" in output_text:
            answer = "1"
        elif "no" in output_text:
            answer = "0"
        else:
            answer = "2"

        # Save result
        ans_id = shortuuid.uuid()
        ans_file.write(json.dumps({
            "question_id": idx,
            "prompt": qs,
            "text": answer,
            "answer_id": ans_id,
            "model_id": model_name,
            "metadata": {
                "raw_response": output_text,
                "category": line.get("category"),
                "subcategory": line.get("subcategory"),
                "set_id": line.get("set_id"),
                "figure_id": line.get("figure_id"),
            }
        }) + "\n")
        ans_file.flush()

    ans_file.close()

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--model-path", type=str, default="nahidalam/maya_full_ft")
    parser.add_argument("--model-base", type=str, default="CohereForAI/aya-23-8B")
    parser.add_argument("--mode", type=str, default="finetuned")
    parser.add_argument("--image-folder", type=str, default="")
    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--temperature", type=float, default=0.2)
    args = parser.parse_args()

    eval_model(args)
48 changes: 48 additions & 0 deletions scripts/convert_hallusionbench_for_eval.py
@@ -0,0 +1,48 @@
import json
import os
from tqdm import tqdm

def convert_for_eval(src_file, dst_file):
    """Convert HallusionBench format to LLaVA format."""
    with open(src_file, 'r') as f:
        data = json.load(f)

    converted = []
    for item in tqdm(data):
        # Get image path if visual input is required
        image_file = None
        if int(item['visual_input']) != 0:
            image_file = os.path.join(
                item['category'],
                item['subcategory'],
                f"{item['set_id']}_{item['figure_id']}.png"
            )

        # Convert to LLaVA format
        converted_item = {
            "question_id": f"{item['category']}_{item['subcategory']}_{item['set_id']}_{item['figure_id']}_{item['question_id']}",
            "image": image_file,
            "text": item['question'],
            "category": item['category'],
            "subcategory": item['subcategory'],
            "set_id": item['set_id'],
            "figure_id": item['figure_id'],
            "visual_input": item['visual_input'],
            "gt_answer": item['gt_answer'],
            "gt_answer_details": item['gt_answer_details']
        }
        converted.append(converted_item)

    # Write to JSONL format
    with open(dst_file, 'w') as f:
        for item in converted:
            f.write(json.dumps(item) + '\n')
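
# Illustrative shape of one converted JSONL record (values are made up; field names
# follow the conversion above, with category/subcategory as used in HallusionBench,
# e.g. "VD"/"illusion"):
# {"question_id": "VD_illusion_0_0_0", "image": "VD/illusion/0_0.png",
#  "text": "Is ...?", "category": "VD", "subcategory": "illusion", "set_id": "0",
#  "figure_id": "0", "visual_input": "1", "gt_answer": "1", "gt_answer_details": "..."}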

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--src", type=str, default="HallusionBench.json")
    parser.add_argument("--dst", type=str, default="llava-hallusionbench.jsonl")
    args = parser.parse_args()

    convert_for_eval(args.src, args.dst)
42 changes: 42 additions & 0 deletions scripts/convert_hallusionbench_result_for_eval.py
@@ -0,0 +1,42 @@
import json
import argparse
from tqdm import tqdm

def convert_result(data_file, src_file, dst_file):
    """Convert LLaVA output format back to HallusionBench format."""
    # Read original HallusionBench data
    with open(data_file, 'r') as f:
        orig_data = json.load(f)

    # Create lookup dictionary
    results = {}
    with open(src_file, 'r') as f:
        for line in f:
            item = json.loads(line)
            results[item['question_id']] = item['text']

    # Add model predictions to original data
    for item in tqdm(orig_data):
        qid = f"{item['category']}_{item['subcategory']}_{item['set_id']}_{item['figure_id']}_{item['question_id']}"
        response = results.get(qid, "")

        # Convert response to required format ("0", "1", or "2").
        # model_vqa_hallusionbench.py already writes answers mapped to "0"/"1"/"2",
        # so pass those values through; fall back to yes/no parsing for raw text.
        response = response.strip()
        if response in ("0", "1", "2"):
            item['model_prediction'] = response
        elif "yes" in response.lower():
            item['model_prediction'] = "1"
        elif "no" in response.lower():
            item['model_prediction'] = "0"
        else:
            item['model_prediction'] = "2"

    # Save results
    with open(dst_file, 'w') as f:
        json.dump(orig_data, f, indent=2)

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", type=str, help="Original HallusionBench data")
    parser.add_argument("--src", type=str, help="Source file with model outputs")
    parser.add_argument("--dst", type=str, help="Destination file for evaluation")
    args = parser.parse_args()

    convert_result(args.data, args.src, args.dst)
54 changes: 54 additions & 0 deletions scripts/maya/eval/hallusionbench.sh
@@ -0,0 +1,54 @@
#!/bin/bash

pip install prettytable openai==0.28

gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
IFS=',' read -ra GPULIST <<< "$gpu_list"

CHUNKS=${#GPULIST[@]}

CKPT="maya_toxicity_free_finetuned"
# CKPT="maya_full_ft"
SPLIT="llava-hallusionbench"
EVAL_DIR="./playground/data/eval/hallusion-bench"

mkdir -p ${EVAL_DIR}/answers/${CKPT}

# Convert HallusionBench format to LLaVA format
python scripts/convert_hallusionbench_for_eval.py \
--src ${EVAL_DIR}/HallusionBench.json \
--dst ${EVAL_DIR}/${SPLIT}.jsonl

# Run model on chunks
for IDX in $(seq 0 $((CHUNKS-1))); do
    CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_hallusionbench \
        --model-path nahidalam/${CKPT} \
        --model-base CohereForAI/aya-23-8B \
        --question-file ${EVAL_DIR}/${SPLIT}.jsonl \
        --image-folder ${EVAL_DIR}/images \
        --answers-file ${EVAL_DIR}/answers/${CKPT}/${CHUNKS}_${IDX}.jsonl \
        --num-chunks $CHUNKS \
        --chunk-idx $IDX \
        --temperature 0 \
        --conv-mode aya &
done

wait

# Merge results
output_file=${EVAL_DIR}/answers/${CKPT}/merge.jsonl
> "$output_file"
for IDX in $(seq 0 $((CHUNKS-1))); do
    cat ${EVAL_DIR}/answers/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file"
done

mkdir -p ${EVAL_DIR}/results/${CKPT}
# Convert results back to HallusionBench format
python scripts/convert_hallusionbench_result_for_eval.py \
--data ${EVAL_DIR}/HallusionBench.json \
--src ${EVAL_DIR}/answers/${CKPT}/merge.jsonl \
--dst ${EVAL_DIR}/results/${CKPT}/HallusionBench_result.json

# Run evaluation
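# Note (assumption, not verified here): the upstream evaluation.py may call the OpenAI
# API for answer judging, which is why openai==0.28 is installed above; if so, export
# OPENAI_API_KEY before this step.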
cd ${EVAL_DIR}
python evaluation.py --model ${CKPT}