diff --git a/docs/Evaluation.md b/docs/Evaluation.md
index c56e4cb6c..af010ec1f 100644
--- a/docs/Evaluation.md
+++ b/docs/Evaluation.md
@@ -183,6 +183,28 @@ CUDA_VISIBLE_DEVICES=0 bash scripts/v1_5/eval/mmvet.sh
 ```
 3. Evaluate the predictions in `./playground/data/eval/mmvet/results` using the official jupyter notebook.
+
+### HallusionBench
+
+1. Download the zipped images folder [`hallusion_bench.zip`](https://drive.google.com/file/d/1eeO1i0G9BSZTE1yd5XeFwmrbe1hwyf_0/view?usp=sharing) and the JSON file [`HallusionBench.json`](https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/HallusionBench.json) to `./playground/data/eval/hallusion-bench` with:
+   ```Shell
+   gdown https://drive.google.com/uc?id=1eeO1i0G9BSZTE1yd5XeFwmrbe1hwyf_0
+   wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/HallusionBench.json
+   ```
+2. Unzip `hallusion_bench.zip` into `./playground/data/eval/hallusion-bench/images` with:
+   ```Shell
+   unzip hallusion_bench.zip -d ./playground/data/eval/hallusion-bench/images
+   ```
+3. Download the `evaluation.py` and `utils.py` scripts from the [HallusionBench GitHub](https://github.com/tianyi-lab/HallusionBench/tree/main) to `./playground/data/eval/hallusion-bench` using:
+   ```Shell
+   wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/evaluation.py
+   wget https://raw.githubusercontent.com/tianyi-lab/HallusionBench/refs/heads/main/utils.py
+   ```
+4. Run multi-GPU inference and evaluation:
+   ```Shell
+   CUDA_VISIBLE_DEVICES=0,1 bash scripts/maya/eval/hallusionbench.sh
+   ```
+
 
 ## More Benchmarks
 
 Below are awesome benchmarks for multimodal understanding from the research community, that are not initially included in the LLaVA-1.5 release.
diff --git a/llava/eval/model_vqa_hallusionbench.py b/llava/eval/model_vqa_hallusionbench.py
new file mode 100644
index 000000000..cdda77423
--- /dev/null
+++ b/llava/eval/model_vqa_hallusionbench.py
@@ -0,0 +1,134 @@
+import argparse
+import torch
+import os
+import json
+from tqdm import tqdm
+import shortuuid
+
+from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from llava.conversation import conv_templates, SeparatorStyle
+from llava.model.builder import load_pretrained_model
+from llava.utils import disable_torch_init
+from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
+from llava.eval.maya.eval_utils import load_maya_model
+
+from PIL import Image
+import math
+
+# TODO: fix answer generation, as all results are 2
+
+def split_list(lst, n):
+    """Split a list into n (roughly) equal-sized chunks"""
+    chunk_size = math.ceil(len(lst) / n)
+    return [lst[i:i+chunk_size] for i in range(0, len(lst), chunk_size)]
+
+
+def get_chunk(lst, n, k):
+    chunks = split_list(lst, n)
+    return chunks[k]
+
+
+def eval_model(args):
+    # Model
+    disable_torch_init()
+    model_path = os.path.expanduser(args.model_path)
+    model_name = get_model_name_from_path(model_path)
+    if 'maya' not in model_name:
+        tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)
+    else:
+        model, tokenizer, image_processor, context_len = load_maya_model(args.model_base, model_path, mode=args.mode)
+
+    questions = [json.loads(q) for q in open(os.path.expanduser(args.question_file), "r")]
+    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
+    answers_file = os.path.expanduser(args.answers_file)
+    os.makedirs(os.path.dirname(answers_file), exist_ok=True)
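+    # One answers file is written per chunk (e.g. answers/<ckpt>/<num_chunks>_<chunk_idx>.jsonl
+    # when launched via scripts/maya/eval/hallusionbench.sh); the launcher merges the
+    # per-chunk files into merge.jsonl after all chunks finish.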
+    ans_file = open(answers_file, "w")
+
+    for line in tqdm(questions):
+        idx = line["question_id"]
+        image_file = line.get("image")  # May be None for text-only questions
+        qs = line["text"]
+
+        # Always create a conversation, but handle the image token differently
+        conv = conv_templates[args.conv_mode].copy()
+
+        if image_file is None:
+            # Text-only question - don't add image token
+            conv.append_message(conv.roles[0], qs)
+            image_tensor = None
+        else:
+            # Image question - add image token
+            if model.config.mm_use_im_start_end:
+                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
+            else:
+                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
+            conv.append_message(conv.roles[0], qs)
+
+            # Process image
+            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
+            image_tensor = process_images([image], image_processor, model.config)[0].unsqueeze(0).half().cuda()
+
+        # Add response placeholder and get prompt
+        conv.append_message(conv.roles[1], None)
+        prompt = conv.get_prompt()
+
+        # Tokenize input
+        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
+
+        # Generate response
+        with torch.inference_mode():
+            outputs = model.generate(
+                inputs=input_ids,
+                images=image_tensor if image_file is not None else None,
+                do_sample=True if args.temperature > 0 else False,
+                temperature=args.temperature,
+                max_new_tokens=30,
+                use_cache=True,
+            )
+
+        output_text = tokenizer.batch_decode(outputs[:, input_ids.shape[1]:], skip_special_tokens=True)[0]
+
+        # Convert output to HallusionBench format (0=No, 1=Yes, 2=Uncertain)
+        output_text = output_text.lower().strip()
+        if "yes" in output_text:
+            answer = "1"
+        elif "no" in output_text:
+            answer = "0"
+        else:
+            answer = "2"
+
+        # Save result
+        ans_id = shortuuid.uuid()
+        ans_file.write(json.dumps({
+            "question_id": idx,
+            "prompt": qs,
+            "text": answer,
+            "answer_id": ans_id,
+            "model_id": model_name,
+            "metadata": {
+                "raw_response": output_text,
+                "category": line.get("category"),
+                "subcategory": line.get("subcategory"),
+                "set_id": line.get("set_id"),
+                "figure_id": line.get("figure_id"),
+            }
+        }) + "\n")
+        ans_file.flush()
+
+    ans_file.close()
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model-path", type=str, default="nahidalam/maya_full_ft")
+    parser.add_argument("--model-base", type=str, default="CohereForAI/aya-23-8B")
+    parser.add_argument("--mode", type=str, default="finetuned")
+    parser.add_argument("--image-folder", type=str, default="")
+    parser.add_argument("--question-file", type=str, default="tables/question.jsonl")
+    parser.add_argument("--answers-file", type=str, default="answer.jsonl")
+    parser.add_argument("--conv-mode", type=str, default="llava_v1")
+    parser.add_argument("--num-chunks", type=int, default=1)
+    parser.add_argument("--chunk-idx", type=int, default=0)
+    parser.add_argument("--temperature", type=float, default=0.2)
+    args = parser.parse_args()
+
+    eval_model(args)
diff --git a/scripts/convert_hallusionbench_for_eval.py b/scripts/convert_hallusionbench_for_eval.py
new file mode 100644
index 000000000..5fe2800aa
--- /dev/null
+++ b/scripts/convert_hallusionbench_for_eval.py
@@ -0,0 +1,48 @@
+import json
+import os
+from tqdm import tqdm
+
+def convert_for_eval(src_file, dst_file):
+    """Convert HallusionBench format to LLaVA format."""
+    with open(src_file, 'r') as f:
+        data = json.load(f)
+
+    converted = []
+    for item in tqdm(data):
+        # Get image path if visual input is required
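+        # visual_input == 0 marks a text-only question (image left as None);
+        # otherwise the figure is referenced by the relative path
+        # <category>/<subcategory>/<set_id>_<figure_id>.png inside the images folder.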
+        image_file = None
+        if int(item['visual_input']) != 0:
+            image_file = os.path.join(
+                item['category'],
+                item['subcategory'],
+                f"{item['set_id']}_{item['figure_id']}.png"
+            )
+
+        # Convert to LLaVA format
+        converted_item = {
+            "question_id": f"{item['category']}_{item['subcategory']}_{item['set_id']}_{item['figure_id']}_{item['question_id']}",
+            "image": image_file,
+            "text": item['question'],
+            "category": item['category'],
+            "subcategory": item['subcategory'],
+            "set_id": item['set_id'],
+            "figure_id": item['figure_id'],
+            "visual_input": item['visual_input'],
+            "gt_answer": item['gt_answer'],
+            "gt_answer_details": item['gt_answer_details']
+        }
+        converted.append(converted_item)
+
+    # Write to JSONL format
+    with open(dst_file, 'w') as f:
+        for item in converted:
+            f.write(json.dumps(item) + '\n')
+
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--src", type=str, default="HallusionBench.json")
+    parser.add_argument("--dst", type=str, default="llava-hallusionbench.jsonl")
+    args = parser.parse_args()
+
+    convert_for_eval(args.src, args.dst)
diff --git a/scripts/convert_hallusionbench_result_for_eval.py b/scripts/convert_hallusionbench_result_for_eval.py
new file mode 100644
index 000000000..f8d090719
--- /dev/null
+++ b/scripts/convert_hallusionbench_result_for_eval.py
@@ -0,0 +1,42 @@
+import json
+import argparse
+from tqdm import tqdm
+
+def convert_result(data_file, src_file, dst_file):
+    """Convert LLaVA output format back to HallusionBench format."""
+    # Read original HallusionBench data
+    with open(data_file, 'r') as f:
+        orig_data = json.load(f)
+
+    # Create lookup dictionary
+    results = {}
+    with open(src_file, 'r') as f:
+        for line in f:
+            item = json.loads(line)
+            results[item['question_id']] = item['text']
+
+    # Add model predictions to original data
+    for item in tqdm(orig_data):
+        qid = f"{item['category']}_{item['subcategory']}_{item['set_id']}_{item['figure_id']}_{item['question_id']}"
+        response = results.get(qid, "")
+
+        # The answers file already encodes responses as "0", "1", or "2"
+        # (see llava/eval/model_vqa_hallusionbench.py), so pass those through;
+        # fall back to yes/no matching only for raw free-form text.
+        response = response.lower().strip()
+        if response in ("0", "1", "2"):
+            item['model_prediction'] = response
+        elif "yes" in response:
+            item['model_prediction'] = "1"
+        elif "no" in response:
+            item['model_prediction'] = "0"
+        else:
+            item['model_prediction'] = "2"
+
+    # Save results
+    with open(dst_file, 'w') as f:
+        json.dump(orig_data, f, indent=2)
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data", type=str, help="Original HallusionBench data")
+    parser.add_argument("--src", type=str, help="Source file with model outputs")
+    parser.add_argument("--dst", type=str, help="Destination file for evaluation")
+    args = parser.parse_args()
+
+    convert_result(args.data, args.src, args.dst)
diff --git a/scripts/maya/eval/hallusionbench.sh b/scripts/maya/eval/hallusionbench.sh
new file mode 100644
index 000000000..5edf6cfc0
--- /dev/null
+++ b/scripts/maya/eval/hallusionbench.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+pip install prettytable openai==0.28
+
+gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
+IFS=',' read -ra GPULIST <<< "$gpu_list"
+
+CHUNKS=${#GPULIST[@]}
+
+CKPT="maya_toxicity_free_finetuned"
+# CKPT="maya_full_ft"
+SPLIT="llava-hallusionbench"
+EVAL_DIR="./playground/data/eval/hallusion-bench"
+
+mkdir -p ${EVAL_DIR}/answers/${CKPT}
+
+# Convert HallusionBench format to LLaVA format
+python scripts/convert_hallusionbench_for_eval.py \
+    --src ${EVAL_DIR}/HallusionBench.json \
+    --dst ${EVAL_DIR}/${SPLIT}.jsonl
+
+# Run model on chunks
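+# Each chunk runs on its own GPU from CUDA_VISIBLE_DEVICES as a background job;
+# `wait` below blocks until every chunk has written its answers file.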
+for IDX in $(seq 0 $((CHUNKS-1))); do
+    CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_hallusionbench \
+        --model-path nahidalam/${CKPT} \
+        --model-base CohereForAI/aya-23-8B \
+        --question-file ${EVAL_DIR}/${SPLIT}.jsonl \
+        --image-folder ${EVAL_DIR}/images \
+        --answers-file ${EVAL_DIR}/answers/${CKPT}/${CHUNKS}_${IDX}.jsonl \
+        --num-chunks $CHUNKS \
+        --chunk-idx $IDX \
+        --temperature 0 \
+        --conv-mode aya &
+done
+
+wait
+
+# Merge results
+output_file=${EVAL_DIR}/answers/${CKPT}/merge.jsonl
+> "$output_file"
+for IDX in $(seq 0 $((CHUNKS-1))); do
+    cat ${EVAL_DIR}/answers/${CKPT}/${CHUNKS}_${IDX}.jsonl >> "$output_file"
+done
+
+mkdir -p ${EVAL_DIR}/results/${CKPT}
+
+# Convert results back to HallusionBench format
+python scripts/convert_hallusionbench_result_for_eval.py \
+    --data ${EVAL_DIR}/HallusionBench.json \
+    --src ${EVAL_DIR}/answers/${CKPT}/merge.jsonl \
+    --dst ${EVAL_DIR}/results/${CKPT}/HallusionBench_result.json
+
+# Run evaluation
+cd ${EVAL_DIR}
+python evaluation.py --model ${CKPT}
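+
+# Note: evaluation.py and utils.py come from the HallusionBench repo; they may call
+# the OpenAI API to judge answers (hence the openai==0.28 install above), in which
+# case OPENAI_API_KEY must be set in the environment before this final step.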