Support sugar crepe for compositionality evaluation (#102)
* add sugar crepe for compositionality evaluation

* minor

* add sugar crepe dataset collection

* minor
mehdidc authored Jul 11, 2023
1 parent 269cc67 commit 2148557
Showing 6 changed files with 8,196 additions and 3,873 deletions.
18 changes: 18 additions & 0 deletions README.md
@@ -198,6 +198,23 @@ For Flickr-8k (zero-shot retrieval)
- `clip_benchmark eval --model xlm-roberta-base-ViT-B-32 --pretrained laion5b_s13b_b90k --dataset=flickr8k --output=result.json --batch_size=64 --language=<LANG>`, where `<LANG>` can be among `en` (english), `zh` (chinese).


### Compositionality evaluation


For [Sugar Crepe](https://github.com/RAIVNLab/sugar-crepe):


`clip_benchmark eval --model ViT-B-32 --pretrained laion400m_e32 --dataset=sugar_crepe/<TASK> --output=result.json`

where `<TASK>` can be among `add_att`, `add_obj`, `replace_att`, `replace_obj`, `replace_rel`, `swap_att`, `swap_obj`.
To evaluate on all the tasks together, you can do:


`clip_benchmark eval --model ViT-B-32 --pretrained laion400m_e32 --dataset=sugar_crepe --output=result.json`
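
Each SugarCrepe task pairs a COCO image with its true caption and a hard-negative caption, and the score is simply how often the model ranks the true caption higher. The snippet below is a minimal, hand-rolled sketch of that selection using plain `open_clip`; the image path and helper name are made up for illustration, and the benchmark's own metric handles the dataset, batching, and AMP for you.

```python
import torch
from PIL import Image
import open_clip

# Minimal sketch of SugarCrepe-style caption selection (illustrative only).
model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-B-32", pretrained="laion400m_e32"
)
tokenizer = open_clip.get_tokenizer("ViT-B-32")
model.eval()

def picks_positive_caption(image_path, positive, negative):
    """True if the model scores the correct caption above the hard negative."""
    image = preprocess(Image.open(image_path)).unsqueeze(0)
    texts = tokenizer([positive, negative])
    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(texts)
        image_features /= image_features.norm(dim=-1, keepdim=True)
        text_features /= text_features.norm(dim=-1, keepdim=True)
        scores = (image_features @ text_features.T).squeeze(0)
    return bool(scores[0] > scores[1])

# Task accuracy is the fraction of examples where the positive caption wins.
```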




### Webdataset example

Here is an example on how to run it on [webdatasets](https://github.com/webdataset/webdataset).
@@ -334,6 +351,7 @@ python setup.py install
- Thanks to [SLIP](https://github.com/facebookresearch/SLIP) authors, some zero-shot templates and classnames are from there.
- Thanks to [Wise-ft](https://github.com/mlfoundations/wise-ft) authors, Imagenet robustness datasets code is adapted from there
- Thanks to [LiT](https://arxiv.org/abs/2111.07991.pdf) authors, some zero-shot templates and classnames of VTAB datasets are from there.
- Thanks to [Sugar Crepe](https://github.com/RAIVNLab/sugar-crepe) authors for the compositionality evaluation tasks on COCO.
- Thanks to [Babel ImageNet](https://github.com/gregor-ge/Babel-ImageNet) authors for multilingual evaluation of ImageNet-1k zero-shot classification.
- Thanks to [ImageNet-W](https://github.com/facebookresearch/Whac-A-Mole) authors for ImageNet-W evaluation
- Thanks to [CuPL](https://github.com/sarahpratt/CuPL) for CuPL prompts.
11,926 changes: 8,057 additions & 3,869 deletions benchmark/results.ipynb

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions clip_benchmark/cli.py
@@ -7,7 +7,7 @@
from copy import copy
import os
from clip_benchmark.datasets.builder import build_dataset, get_dataset_collate_fn, get_dataset_default_task, dataset_collection, get_dataset_collection_from_file
from clip_benchmark.metrics import zeroshot_classification, zeroshot_retrieval, linear_probe, captioning
from clip_benchmark.metrics import image_caption_selection, zeroshot_classification, zeroshot_retrieval, linear_probe, captioning
from clip_benchmark.model_collection import get_model_collection_from_file, model_collection
from clip_benchmark.models import load_clip, MODEL_TYPES

@@ -22,7 +22,7 @@ def get_parser_args():
parser_eval.add_argument('--model', type=str, default="ViT-B-32-quickgelu", help="Model architecture to use from OpenCLIP")
parser_eval.add_argument('--pretrained', type=str, default="laion400m_e32", help="Model checkpoint name to use from OpenCLIP")
parser_eval.add_argument('--pretrained_model', type=str, default="", nargs="+", help="Pre-trained model(s) to use. Can be the full model name where `model` and `pretrained` are comma separated (e.g., --pretrained_model='ViT-B-32-quickgelu,laion400m_e32'), a model collection name ('openai' or 'openclip_base' or 'openclip_multilingual' or 'openclip_all'), or path of a text file where each line is a model fullname where model and pretrained are comma separated (e.g., ViT-B-32-quickgelu,laion400m_e32). --model and --pretrained are ignored if --pretrained_model is used.")
parser_eval.add_argument('--task', type=str, default="auto", choices=["zeroshot_classification", "zeroshot_retrieval", "linear_probe", "captioning", "auto"], help="Task to evaluate on. With --task=auto, the task is automatically inferred from the dataset.")
parser_eval.add_argument('--task', type=str, default="auto", choices=["zeroshot_classification", "zeroshot_retrieval", "linear_probe", "captioning", "image_caption_selection", "auto"], help="Task to evaluate on. With --task=auto, the task is automatically inferred from the dataset.")
parser_eval.add_argument('--no_amp', action="store_false", dest="amp", default=True, help="whether to use mixed precision")
parser_eval.add_argument('--num_workers', default=4, type=int)
parser_eval.add_argument('--recall_k', default=[5], type=int, help="for retrieval, select the k for Recall@K metric. ", nargs="+",)
@@ -245,6 +245,14 @@ def run(args):
device=args.device,
amp=args.amp
)
elif task == "image_caption_selection":
metrics = image_caption_selection.evaluate(
model,
dataloader,
tokenizer,
device=args.device,
amp=args.amp,
)
elif task == "linear_probe":
# we also need the train split for linear probing.
train_dataset = build_dataset(
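
For context, the new branch above calls `image_caption_selection.evaluate(model, dataloader, tokenizer, device=..., amp=...)`. Roughly speaking, such a metric reduces to a caption-selection accuracy; the sketch below is only an assumed approximation of that logic (including the batch format, where index 0 is taken to be the positive caption), not the module added in this commit.

```python
import torch

def evaluate_sketch(model, dataloader, tokenizer, device="cpu", amp=True):
    """Illustrative caption-selection accuracy; not the repository's implementation."""
    device_type = "cuda" if str(device).startswith("cuda") else "cpu"
    correct, total = 0, 0
    model.eval()
    with torch.no_grad(), torch.autocast(device_type=device_type, enabled=amp):
        for images, caption_lists in dataloader:  # assumed batch format
            image_features = model.encode_image(images.to(device))
            image_features /= image_features.norm(dim=-1, keepdim=True)
            for img_feat, captions in zip(image_features, caption_lists):
                text_features = model.encode_text(tokenizer(captions).to(device))
                text_features /= text_features.norm(dim=-1, keepdim=True)
                scores = img_feat @ text_features.T
                correct += int(scores.argmax().item() == 0)  # assume index 0 = true caption
                total += 1
    return {"acc": correct / total}
```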
