Official codebase for the paper "Scaling Inference-Time Search with Vision Value Model for Improved Visual Comprehension".
# Environment setup and script invocations for this codebase.
# Run from the repository root: every source path below is relative to it.
#
# Optional environment override:
#   SITE_PACKAGES  site-packages directory of the Python environment to patch.
#                  Default: ~/.conda/envs/visvm/lib/python3.10/site-packages
#
# NOTE: the commands are not chained; each one runs even if an earlier one
# reported an error (e.g. `conda env create` when the env already exists).

# Create the conda environment described by environment.yml.
conda env create -f environment.yml

# Resolve the destination once instead of repeating the literal path.
# The tilde is expanded here, at assignment time, so the quoted uses below
# receive an absolute path.
default_site_packages=~/.conda/envs/visvm/lib/python3.10/site-packages
SITE_PACKAGES="${SITE_PACKAGES:-${default_site_packages}}"

# Copy the files shipped under ./utils into the installed transformers and
# trl packages, replacing any same-named file already present there.
cp -- ./utils/modeling_llava_next.py "${SITE_PACKAGES}/transformers/models/llava_next/"
cp -- ./utils/trainer/td_trainer.py "${SITE_PACKAGES}/trl/trainer/"
cp -- ./utils/__init__.py "${SITE_PACKAGES}/trl/"
cp -- ./utils/trainer/__init__.py "${SITE_PACKAGES}/trl/trainer/"

# Run the project scripts in their listed order.
# NOTE(review): judging by the names these are generation, CLIP scoring,
# value-model training and SFT training -- confirm against ./script/.
bash ./script/batch_generate.sh
bash ./script/clip_score.sh
bash ./script/train_value.sh
bash ./script/train_sft.sh