From dc401d3a4e33d1e65490e1355bc82215e718dc19 Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Wed, 1 May 2024 17:58:01 +0200
Subject: [PATCH] Improve object detection task guideline (#29967)

* Add improvements

* Address comment
---
 docs/source/en/tasks/object_detection.md | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index 273484bbb3ef02..0d640ca459cc26 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -41,11 +41,11 @@ To see all architectures and checkpoints compatible with this task, we recommend
 Before you begin, make sure you have all the necessary libraries installed:
 
 ```bash
-pip install -q datasets transformers evaluate timm albumentations
+pip install -q datasets transformers accelerate evaluate albumentations
 ```
 
 You'll use 🤗 Datasets to load a dataset from the Hugging Face Hub, 🤗 Transformers to train your model,
-and `albumentations` to augment the data. `timm` is currently required to load a convolutional backbone for the DETR model.
+and `albumentations` to augment the data.
 
 We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the Hub.
 When prompted, enter your token to log in:
@@ -342,6 +342,7 @@ and `id2label` maps that you created earlier from the dataset's metadata. Additi
 ...     id2label=id2label,
 ...     label2id=label2id,
 ...     ignore_mismatched_sizes=True,
+...     revision="no_timm",  # use the checkpoint revision that loads without the timm dependency
 ... )
 ```
 
@@ -357,7 +358,7 @@ Face to upload your model).
 >>> training_args = TrainingArguments(
 ...     output_dir="detr-resnet-50_finetuned_cppe5",
 ...     per_device_train_batch_size=8,
-...     num_train_epochs=10,
+...     num_train_epochs=100,
 ...     fp16=True,
 ...     save_steps=200,
 ...     logging_steps=50,
@@ -487,10 +488,10 @@ Next, prepare an instance of a `CocoDetection` class that can be used with `coco
 ...         return {"pixel_values": pixel_values, "labels": target}
 
 
->>> im_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
+>>> image_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
 
 >>> path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"])
->>> test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno)
+>>> test_ds_coco_format = CocoDetection(path_output_cppe5, image_processor, path_anno)
 ```
 
 Finally, load the metrics and run the evaluation.
@@ -505,10 +506,13 @@ Finally, load the metrics and run the evaluation.
 ...     test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn
 ... )
 
+>>> device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
+>>> model.to(device)
+
 >>> with torch.no_grad():
 ...     for idx, batch in enumerate(tqdm(val_dataloader)):
-...         pixel_values = batch["pixel_values"]
-...         pixel_mask = batch["pixel_mask"]
+...         pixel_values = batch["pixel_values"].to(device)
+...         pixel_mask = batch["pixel_mask"].to(device)
 
 ...         labels = [
 ...             {k: v for k, v in t.items()} for t in batch["labels"]
@@ -518,8 +522,9 @@ Finally, load the metrics and run the evaluation.
 ...         outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
 
 ...         orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
-...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
-
+...         # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
+...         results = image_processor.post_process_object_detection(outputs, threshold=0, target_sizes=orig_target_sizes)
+...
 ...         module.add(prediction=results, reference=labels)
 ...         del batch