Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for modular with fast image processors #35379

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,19 +1,9 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for Deformable DETR."""

# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/deformable_detr/modular_deformable_detr.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_deformable_detr.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import functools
import pathlib
from typing import Any, Dict, List, Optional, Tuple, Union
Expand All @@ -26,10 +16,7 @@
get_max_height_width,
safe_squeeze,
)
from ...image_transforms import (
center_to_corners_format,
corners_to_center_format,
)
from ...image_transforms import center_to_corners_format, corners_to_center_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
Expand All @@ -43,7 +30,6 @@
get_image_type,
infer_channel_dimension_format,
make_list_of_images,
pil_torch_interpolation_mapping,
validate_annotations,
validate_kwargs,
)
Expand All @@ -55,32 +41,30 @@
is_vision_available,
logging,
)
from .image_processing_deformable_detr import (
get_size_with_aspect_ratio,
)
from .image_processing_deformable_detr import get_size_with_aspect_ratio


if is_torch_available():
import torch

if is_torchvision_available():
from torchvision.io import read_image
if is_vision_available():
from ...image_utils import pil_torch_interpolation_mapping

if is_vision_available():
from ...image_utils import pil_torch_interpolation_mapping

if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
else:
from torchvision.transforms import functional as F
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F


logger = logging.get_logger(__name__)

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# Copied from transformers.models.detr.image_processing_detr_fast.convert_coco_poly_to_mask
# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
"""
Convert a COCO polygon annotation to a mask.
Expand Down Expand Up @@ -115,15 +99,15 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: to
return masks


# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_detection_annotation with DETR->DeformableDetr
# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L50
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DeformableDetr.
Convert the target in COCO format into the format expected by DeformableDetr.
"""
image_height, image_width = image.size()[-2:]

Expand Down Expand Up @@ -180,7 +164,6 @@ def prepare_coco_detection_annotation(
return new_target


# Copied from transformers.models.detr.image_processing_detr_fast.masks_to_boxes
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Expand Down Expand Up @@ -215,7 +198,9 @@ def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
return torch.stack([x_min, y_min, x_max, y_max], 1)


# Copied from transformers.models.detr.image_processing_detr_fast.rgb_to_id
# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
# Copyright (c) 2018, Alexander Kirillov
# All rights reserved.
def rgb_to_id(color):
"""
Converts RGB color to unique ID.
Expand All @@ -227,7 +212,6 @@ def rgb_to_id(color):
return int(color[0] + 256 * color[1] + 256 * 256 * color[2])


# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_panoptic_annotation with DETR->DeformableDetr
def prepare_coco_panoptic_annotation(
image: torch.Tensor,
target: Dict,
Expand All @@ -236,7 +220,7 @@ def prepare_coco_panoptic_annotation(
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for DeformableDetr.
Prepare a COCO panoptic annotation for DeformableDetr.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
Expand Down Expand Up @@ -279,13 +263,13 @@ def prepare_coco_panoptic_annotation(

class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
r"""
Constructs a fast Deformable DETR image processor.
Constructs a fast DeformableDetr image processor.

Args:
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
Expand Down Expand Up @@ -316,7 +300,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
Controls whether to convert the annotations to the format expected by the DeformableDetr model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Expand All @@ -332,7 +316,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):

model_input_names = ["pixel_values", "pixel_mask"]

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.__init__
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
Expand Down Expand Up @@ -404,7 +387,6 @@ def __init__(
]

@classmethod
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.from_dict with Detr->DeformableDetr
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
Expand All @@ -418,7 +400,6 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
return super().from_dict(image_processor_dict, **kwargs)

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.prepare_annotation with DETR->DeformableDetr
def prepare_annotation(
self,
image: torch.Tensor,
Expand All @@ -429,7 +410,7 @@ def prepare_annotation(
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DeformableDetr model.
Prepare an annotation for feeding into the DeformableDetr model.
"""
format = format if format is not None else self.format

Expand All @@ -451,7 +432,6 @@ def prepare_annotation(
raise ValueError(f"Format {format} is not supported.")
return target

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize
def resize(
self,
image: torch.Tensor,
Expand Down Expand Up @@ -506,7 +486,6 @@ def resize(
)
return image

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation
def resize_annotation(
self,
annotation: Dict[str, Any],
Expand Down Expand Up @@ -560,7 +539,6 @@ def resize_annotation(

return new_annotation

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
Expand All @@ -576,7 +554,6 @@ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) ->
norm_annotation[key] = value
return norm_annotation

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
Expand Down Expand Up @@ -612,7 +589,6 @@ def _update_annotation_for_padded_image(
new_annotation[key] = value
return new_annotation

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad
def pad(
self,
image: torch.Tensor,
Expand Down Expand Up @@ -644,7 +620,6 @@ def pad(
return image, pixel_mask, annotation

@functools.lru_cache(maxsize=1)
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments
def _validate_input_arguments(
self,
do_rescale: bool,
Expand Down Expand Up @@ -673,7 +648,6 @@ def _validate_input_arguments(
if do_normalize and None in (image_mean, image_std):
raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.preprocess
def preprocess(
self,
images: ImageInput,
Expand Down Expand Up @@ -874,7 +848,7 @@ def preprocess(
processed_annotations = []
pixel_masks = [] # Initialize pixel_masks here
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
# prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
# prepare (COCO annotations as a list of Dict -> DeformableDetr target as a single Dict per image)
if annotations is not None:
annotation = self.prepare_annotation(
image,
Expand Down Expand Up @@ -950,7 +924,6 @@ def preprocess(
]
return encoded_inputs

# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process
def post_process(self, outputs, target_sizes):
"""
Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
Expand Down Expand Up @@ -996,7 +969,6 @@ def post_process(self, outputs, target_sizes):

return results

# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
):
Expand Down
Loading