From baebefd6da2f614948f1aa572302091d3760947d Mon Sep 17 00:00:00 2001
From: sangbumchoi
Date: Wed, 7 Aug 2024 00:17:11 +0000
Subject: [PATCH] tmp

---
 examples/pytorch/zero-shot/README.md          | 29 ++++---
 ...n_zero_shot_object_detection_no_trainer.py | 86 +++++++++++++++----
 2 files changed, 83 insertions(+), 32 deletions(-)

diff --git a/examples/pytorch/zero-shot/README.md b/examples/pytorch/zero-shot/README.md
index 34d18393c7c4d2..6bb2cca13e7496 100644
--- a/examples/pytorch/zero-shot/README.md
+++ b/examples/pytorch/zero-shot/README.md
@@ -27,11 +27,11 @@ Content:
 
 ## PyTorch version, Trainer
 
-Based on the script [`run_object_detection.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_object_detection.py).
+Based on the script [`run_zero_shot_object_detection.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/zero-shot/run_zero_shot_object_detection.py).
 
 The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transformers/main_classes/trainer) to automatically take care of the training for you, running on distributed environments right away.
 
-Here we show how to fine-tune a [DETR](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset:
+Here we show how to fine-tune a [GroundingDino](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset:
 
 ```bash
 python run_zero_shot_object_detection.py \
@@ -39,7 +39,7 @@ python run_zero_shot_object_detection.py \
     --dataset_name cppe-5 \
     --do_train true \
     --do_eval true \
-    --output_dir grounding-dino-tiny-finetuned-cppe-5-10k-steps \
+    --output_dir grounding-dino-tiny-finetuned-cppe5-10k-steps \
     --num_train_epochs 100 \
     --image_square_size 600 \
     --fp16 true \
@@ -69,7 +69,7 @@ python run_zero_shot_object_detection.py \
 
 `--eval_do_concat_batches false` is required for correct evaluation of detection models; `--ignore_mismatched_sizes true` is required to load detection model for finetuning with different number of classes.
 
-The resulting model can be seen here: https://huggingface.co/qubvel-hf/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5. The corresponding Weights and Biases report [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for current example were not tuned. To improve model quality you could try:
+The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe5-10k-steps. The corresponding Weights and Biases report can be found [here](https://api.wandb.ai/links/qubvel-hf-co/bnm0r5ex). Note that it's always advised to check the original paper to know the details regarding training hyperparameters. Hyperparameters for the current example were not tuned. To improve model quality you could try:
 - changing image size parameters (`--shortest_edge`/`--longest_edge`)
 - changing training parameters, such as learning rate, batch size, warmup, optimizer and many more (see [TrainingArguments](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments))
 - adding more image augmentations (we created a helpful [HF Space](https://huggingface.co/spaces/qubvel-hf/albumentations-demo) to choose some)
@@ -82,7 +82,7 @@ For dataset, make sure it provides labels in the same format as [CPPE-5](https:/
 
 ## PyTorch version, no Trainer
 
-Based on the script [`run_object_detection_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/object-detection/run_object_detection.py).
+Based on the script [`run_zero_shot_object_detection_no_trainer.py`](https://github.com/huggingface/transformers/blob/main/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py).
 
 The script leverages [🤗 `Accelerate`](https://github.com/huggingface/accelerate), which allows to write your own training loop in PyTorch, but have it run instantly on any (distributed) environment, including CPU, multi-CPU, GPU, multi-GPU and TPU. It also supports mixed precision.
 
@@ -104,8 +104,8 @@ that will check everything is ready for training. Finally, you can launch traini
 accelerate launch run_zero_shot_object_detection_no_trainer.py \
     --model_name_or_path "IDEA-Research/grounding-dino-tiny" \
     --dataset_name cppe-5 \
-    --output_dir "grounding-dino-tiny-finetuned" \
-    --num_train_epochs 100 \
+    --output_dir "grounding-dino-tiny-finetuned-cppe5-10k-steps-no-trainer" \
+    --num_train_epochs 10 \
     --image_square_size 600 \
     --per_device_train_batch_size 1 \
     --per_device_eval_batch_size 1 \
@@ -118,7 +118,7 @@ accelerate launch run_zero_shot_object_detection_no_trainer.py \
 
 and boom, you're training, possibly on multiple GPUs, logging everything to all trackers found in your environment (like Weights and Biases, Tensorboard) and regularly pushing your model to the hub (with the repo name being equal to `args.output_dir` at your HF username) 🤗
 
-With the default settings, the script fine-tunes a [DETR](https://huggingface.co/facebook/detr-resnet-50) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/qubvel-hf/detr-resnet-50-finetuned-10k-cppe5-no-trainer.
+With the default settings, the script fine-tunes a [GroundingDino](https://huggingface.co/IDEA-Research/grounding-dino-tiny) model on the [CPPE-5](https://huggingface.co/datasets/cppe-5) dataset. The resulting model can be seen here: https://huggingface.co/danelcsb/grounding-dino-tiny-finetuned-10k-cppe5-no-trainer.
 
 ## Reload and perform inference
 
@@ -130,20 +130,21 @@ import requests
 import torch
 from PIL import Image
 
-from transformers import AutoImageProcessor, AutoModelForObjectDetection
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 
 # Name of repo on the hub or path to a local folder
-model_name = "qubvel-hf/detr-resnet-50-finetuned-10k-cppe5"
+model_name = "danelcsb/grounding-dino-tiny-finetuned-10k-cppe5"
 
-image_processor = AutoImageProcessor.from_pretrained(model_name)
-model = AutoModelForObjectDetection.from_pretrained(model_name)
+processor = AutoProcessor.from_pretrained(model_name)
+model = AutoModelForZeroShotObjectDetection.from_pretrained(model_name)
 
 # Load image for inference
 url = "https://images.pexels.com/photos/8413299/pexels-photo-8413299.jpeg?auto=compress&cs=tinysrgb&w=630&h=375&dpr=2"
 image = Image.open(requests.get(url, stream=True).raw)
+text = "Coverall. Face_Shield. Gloves. Goggles. Mask."
 
 # Prepare image for the model
-inputs = image_processor(images=image, return_tensors="pt")
+inputs = processor(images=image, text=text, return_tensors="pt")
 
 with torch.no_grad():
     outputs = model(**inputs)
@@ -152,7 +153,7 @@ with torch.no_grad():
 # this include conversion to Pascal VOC format and filtering non confident boxes
 width, height = image.size
 target_sizes = torch.tensor([height, width]).unsqueeze(0)  # add batch dim
-results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
+results = processor.post_process_grounded_object_detection(outputs, inputs.input_ids, box_threshold=0.15, text_threshold=0.1, target_sizes=target_sizes)[0]
 
 for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
     box = [round(i, 2) for i in box.tolist()]
diff --git a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py
index d497204e7564b7..99a6ac04a924e4 100644
--- a/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py
+++ b/examples/pytorch/zero-shot/run_zero_shot_object_detection_no_trainer.py
@@ -27,7 +27,7 @@
 import datasets
 import numpy as np
 import torch
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedDataParallelKwargs
 from accelerate.logging import get_logger
 from accelerate.utils import set_seed
 from datasets import load_dataset
@@ -118,6 +118,50 @@ def convert_bbox_yolo_to_pascal(boxes: torch.Tensor, image_size: Tuple[int, int]
     return boxes
 
 
+def convert_zero_shot_to_coco_format(predictions, label2id):
+    """
+    Convert zero-shot format output to typical object detection format in order to calculate mAP.
+
+    Args:
+        predictions (Dict): Output of zero-shot object detection
+            e.g.
+            {
+                'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'),
+                'labels': ['a cat', 'a cat', 'a remote control'],
+                'boxes': tensor([[344.6973,  23.1085, 637.1817, 374.2748],
+                        [ 12.2690,  51.9104, 316.8564, 472.4341],
+                        [ 38.5870,  70.0092, 176.7755, 118.1748]], device='cuda:0')
+            }
+        label2id (Dict): Dictionary of label to id mapping
+
+    Returns:
+        Dict: Output of zero-shot object detection with labels converted to integer ids
+            e.g.
+            {
+                'scores': tensor([0.4786, 0.4379, 0.4760], device='cuda:0'),
+                'labels': tensor([1, 1, 2], device='cuda:0'),
+                'boxes': tensor([[344.6973,  23.1085, 637.1817, 374.2748],
+                        [ 12.2690,  51.9104, 316.8564, 472.4341],
+                        [ 38.5870,  70.0092, 176.7755, 118.1748]], device='cuda:0')
+            }
+    """
+    # map predicted text labels to integer category ids
+    for prediction in predictions:
+        torch_label = []
+        scores = prediction['scores']
+        device = scores.device
+        labels = prediction['labels']
+        for label in labels:
+            if label in label2id:
+                torch_label.append(label2id[label])
+            else:
+                # Give background class
+                torch_label.append(0)
+        prediction['labels'] = torch.tensor(torch_label, dtype=torch.long, device=device)
+
+    return predictions
+
+
 # Copied from examples/pytorch/object-detection/run_object_detection.augment_and_transform_batch
 def augment_and_transform_batch(
     examples: Mapping[str, Any],
@@ -187,6 +231,7 @@ def evaluation_loop(
     accelerator: Accelerator,
     dataloader: DataLoader,
     id2label: Mapping[int, str],
+    label2id: Mapping[str, int],
 ) -> dict:
     model.eval()
     metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
@@ -201,8 +246,10 @@ def evaluation_loop(
         # processor convert boxes from YOLO format to Pascal VOC format
         # ([x_min, y_min, x_max, y_max] in absolute coordinates)
         image_size = torch.stack([example["orig_size"] for example in batch["labels"]], dim=0)
-        predictions = processor.post_process_grounded_object_detection(outputs, threshold=0.0, target_sizes=image_size)
+        input_ids = torch.stack([input_ids for input_ids in batch["input_ids"]], dim=0)
+        predictions = processor.post_process_grounded_object_detection(outputs, input_ids, box_threshold=0.0, text_threshold=0.0, target_sizes=image_size)
         predictions = nested_to_cpu(predictions)
+        predictions = convert_zero_shot_to_coco_format(predictions, label2id)
 
         # 2. Collect ground truth boxes in the same format for metric computation
         # Do the same, convert YOLO boxes to Pascal VOC format
@@ -215,19 +262,20 @@ def evaluation_loop(
         metric.update(predictions, target)
 
-    metrics = metric.compute()
+    # metrics = metric.compute()
 
-    # Replace list of per class metrics with separate metric for each class
-    classes = metrics.pop("classes")
-    map_per_class = metrics.pop("map_per_class")
-    mar_100_per_class = metrics.pop("mar_100_per_class")
-    for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
-        class_name = id2label[class_id.item()]
-        metrics[f"map_{class_name}"] = class_map
-        metrics[f"mar_100_{class_name}"] = class_mar
+    # # Replace list of per class metrics with separate metric for each class
+    # classes = metrics.pop("classes")
+    # map_per_class = metrics.pop("map_per_class")
+    # mar_100_per_class = metrics.pop("mar_100_per_class")
+    # for class_id, class_map, class_mar in zip(classes, map_per_class, mar_100_per_class):
+    #     class_name = id2label[class_id.item()]
+    #     metrics[f"map_{class_name}"] = class_map
+    #     metrics[f"mar_100_{class_name}"] = class_mar
 
-    # Convert metrics to float
-    metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
+    # # Convert metrics to float
+    # metrics = {k: round(v.item(), 4) for k, v in metrics.items()}
+    metrics = {}
 
     return metrics
 
@@ -412,6 +460,7 @@ def main():
     if args.with_tracking:
         accelerator_log_kwargs["log_with"] = args.report_to
         accelerator_log_kwargs["project_dir"] = args.output_dir
+        accelerator_log_kwargs["kwargs_handlers"] = [DistributedDataParallelKwargs(find_unused_parameters=True)]
 
     accelerator = Accelerator(gradient_accumulation_steps=args.gradient_accumulation_steps, **accelerator_log_kwargs)
 
@@ -463,7 +512,6 @@ def main():
     # Get dataset categories and prepare mappings for label_name <-> label_id
     categories = dataset["train"].features["objects"].feature["category"].names
     id2label = dict(enumerate(categories))
-    prompt = ". ".join(id2label.values()) + "."
    label2id = {v: k for k, v in id2label.items()}
 
     # ------------------------------------------------------------------------------------------------
@@ -522,11 +570,13 @@ def main():
     )
 
     # Make transform functions for batch and apply for dataset splits
+    prompt = ". ".join(id2label.values()) + "."
+
     train_transform_batch = partial(
-        augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, prompt=prompt
+        augment_and_transform_batch, transform=train_augment_and_transform, processor=processor, prompt=prompt,
     )
     validation_transform_batch = partial(
-        augment_and_transform_batch, transform=validation_transform, processor=processor, prompt=prompt
+        augment_and_transform_batch, transform=validation_transform, processor=processor, prompt=prompt,
     )
 
     with accelerator.main_process_first():
@@ -708,7 +758,7 @@ def main():
                 break
 
         logger.info("***** Running evaluation *****")
-        metrics = evaluation_loop(model, processor, accelerator, valid_dataloader, id2label)
+        metrics = evaluation_loop(model, processor, accelerator, valid_dataloader, id2label, label2id)
 
        logger.info(f"epoch {epoch}: {metrics}")
 
@@ -750,7 +800,7 @@ def main():
     # ------------------------------------------------------------------------------------------------
 
     logger.info("***** Running evaluation on test dataset *****")
-    metrics = evaluation_loop(model, processor, accelerator, test_dataloader, id2label)
+    metrics = evaluation_loop(model, processor, accelerator, test_dataloader, id2label, label2id)
     metrics = {f"test_{k}": v for k, v in metrics.items()}
 
     logger.info(f"Test metrics: {metrics}")
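
A minimal sketch (not part of the patch) of the conversion step that `convert_zero_shot_to_coco_format` performs and of the prediction format `MeanAveragePrecision` expects: the grounded post-processor returns text phrases as labels, so they are mapped to integer category ids via `label2id` before `metric.update` is called. The `label2id` mapping, boxes, and scores below are made-up values, and unmatched phrases fall back to class 0 as in the helper.

```python
import torch
from torchmetrics.detection.mean_ap import MeanAveragePrecision

# Hypothetical CPPE-5-style mapping; real values come from the dataset features.
label2id = {"Coverall": 0, "Face_Shield": 1, "Gloves": 2, "Goggles": 3, "Mask": 4}

# One zero-shot prediction: labels are text phrases, not class ids.
prediction = {
    "scores": torch.tensor([0.48, 0.44]),
    "labels": ["Gloves", "some unmatched phrase"],
    "boxes": torch.tensor([[10.0, 20.0, 110.0, 220.0], [5.0, 5.0, 50.0, 60.0]]),
}

# Map text labels to integer ids; unknown phrases fall back to class 0.
prediction["labels"] = torch.tensor(
    [label2id.get(label, 0) for label in prediction["labels"]], dtype=torch.long
)

# MeanAveragePrecision consumes integer labels, not strings.
metric = MeanAveragePrecision(box_format="xyxy", class_metrics=True)
target = [{"labels": torch.tensor([2]), "boxes": torch.tensor([[12.0, 18.0, 108.0, 222.0]])}]
metric.update([prediction], target)
print({k: v for k, v in metric.compute().items() if k in ("map", "map_50")})
```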