Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for modular with fast image processors #35379

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,19 +1,9 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Fast Image processor class for Deformable DETR."""

# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
# This file was automatically generated from src/transformers/models/deformable_detr/modular_deformable_detr.py.
# Do NOT edit this file manually as any edits will be overwritten by the generation of
# the file from the modular. If any change should be done, please apply the change to the
# modular_deformable_detr.py file directly. One of our CI enforces this.
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
import functools
import pathlib
from typing import Any, Dict, List, Optional, Tuple, Union
Expand All @@ -26,10 +16,7 @@
get_max_height_width,
safe_squeeze,
)
from ...image_transforms import (
center_to_corners_format,
corners_to_center_format,
)
from ...image_transforms import center_to_corners_format, corners_to_center_format
from ...image_utils import (
IMAGENET_DEFAULT_MEAN,
IMAGENET_DEFAULT_STD,
Expand All @@ -43,7 +30,6 @@
get_image_type,
infer_channel_dimension_format,
make_list_of_images,
pil_torch_interpolation_mapping,
validate_annotations,
validate_kwargs,
)
Expand All @@ -55,32 +41,30 @@
is_vision_available,
logging,
)
from .image_processing_deformable_detr import (
get_size_with_aspect_ratio,
)
from .image_processing_deformable_detr import get_size_with_aspect_ratio


if is_torch_available():
import torch

if is_torchvision_available():
from torchvision.io import read_image
if is_vision_available():
from ...image_utils import pil_torch_interpolation_mapping

if is_vision_available():
from ...image_utils import pil_torch_interpolation_mapping

if is_torchvision_v2_available():
from torchvision.transforms.v2 import functional as F
else:
from torchvision.transforms import functional as F
if is_torchvision_v2_available():
from torchvision.io import read_image
from torchvision.transforms.v2 import functional as F
elif is_torchvision_available():
from torchvision.io import read_image
from torchvision.transforms import functional as F


logger = logging.get_logger(__name__)

SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)


# Copied from transformers.models.detr.image_processing_detr_fast.convert_coco_poly_to_mask
# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L33
def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: torch.device) -> torch.Tensor:
"""
Convert a COCO polygon annotation to a mask.
Expand Down Expand Up @@ -115,15 +99,15 @@ def convert_coco_poly_to_mask(segmentations, height: int, width: int, device: to
return masks


# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_detection_annotation with DETR->DeformableDetr
# inspired by https://github.com/facebookresearch/deformable_detr/blob/master/datasets/coco.py#L50
def prepare_coco_detection_annotation(
image,
target,
return_segmentation_masks: bool = False,
input_data_format: Optional[Union[ChannelDimension, str]] = None,
):
"""
Convert the target in COCO format into the format expected by DeformableDetr.
Convert the target in COCO format into the format expected by DeformableDetr.
"""
image_height, image_width = image.size()[-2:]

Expand Down Expand Up @@ -180,7 +164,6 @@ def prepare_coco_detection_annotation(
return new_target


# Copied from transformers.models.detr.image_processing_detr_fast.masks_to_boxes
def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
"""
Compute the bounding boxes around the provided panoptic segmentation masks.
Expand Down Expand Up @@ -215,7 +198,9 @@ def masks_to_boxes(masks: torch.Tensor) -> torch.Tensor:
return torch.stack([x_min, y_min, x_max, y_max], 1)


# Copied from transformers.models.detr.image_processing_detr_fast.rgb_to_id
# 2 functions below adapted from https://github.com/cocodataset/panopticapi/blob/master/panopticapi/utils.py
# Copyright (c) 2018, Alexander Kirillov
# All rights reserved.
def rgb_to_id(color):
"""
Converts RGB color to unique ID.
Expand All @@ -227,7 +212,6 @@ def rgb_to_id(color):
return int(color[0] + 256 * color[1] + 256 * 256 * color[2])


# Copied from transformers.models.detr.image_processing_detr_fast.prepare_coco_panoptic_annotation with DETR->DeformableDetr
def prepare_coco_panoptic_annotation(
image: torch.Tensor,
target: Dict,
Expand All @@ -236,7 +220,7 @@ def prepare_coco_panoptic_annotation(
input_data_format: Union[ChannelDimension, str] = None,
) -> Dict:
"""
Prepare a coco panoptic annotation for DeformableDetr.
Prepare a COCO panoptic annotation for DeformableDetr.
"""
image_height, image_width = get_image_size(image, channel_dim=input_data_format)
annotation_path = pathlib.Path(masks_path) / target["file_name"]
Expand Down Expand Up @@ -279,13 +263,13 @@ def prepare_coco_panoptic_annotation(

class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
r"""
Constructs a fast Deformable DETR image processor.
Constructs a fast DeformableDetr image processor.

Args:
format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
Data format of the annotations. One of "coco_detection" or "coco_panoptic".
do_resize (`bool`, *optional*, defaults to `True`):
Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
Controls whether to resize the image's `(height, width)` dimensions to the specified `size`. Can be
overridden by the `do_resize` parameter in the `preprocess` method.
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
Expand Down Expand Up @@ -316,7 +300,7 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):
Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
do_convert_annotations (`bool`, *optional*, defaults to `True`):
Controls whether to convert the annotations to the format expected by the DETR model. Converts the
Controls whether to convert the annotations to the format expected by the DeformableDetr model. Converts the
bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
do_pad (`bool`, *optional*, defaults to `True`):
Expand All @@ -332,7 +316,6 @@ class DeformableDetrImageProcessorFast(BaseImageProcessorFast):

model_input_names = ["pixel_values", "pixel_mask"]

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.__init__
def __init__(
self,
format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
Expand Down Expand Up @@ -404,7 +387,6 @@ def __init__(
]

@classmethod
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.from_dict with Detr->DeformableDetr
def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
"""
Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
Expand All @@ -418,7 +400,6 @@ def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
return super().from_dict(image_processor_dict, **kwargs)

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.prepare_annotation with DETR->DeformableDetr
def prepare_annotation(
self,
image: torch.Tensor,
Expand All @@ -429,7 +410,7 @@ def prepare_annotation(
input_data_format: Optional[Union[str, ChannelDimension]] = None,
) -> Dict:
"""
Prepare an annotation for feeding into DeformableDetr model.
Prepare an annotation for feeding into the DeformableDetr model.
"""
format = format if format is not None else self.format

Expand All @@ -451,7 +432,6 @@ def prepare_annotation(
raise ValueError(f"Format {format} is not supported.")
return target

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize
def resize(
self,
image: torch.Tensor,
Expand Down Expand Up @@ -506,7 +486,6 @@ def resize(
)
return image

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.resize_annotation
def resize_annotation(
self,
annotation: Dict[str, Any],
Expand Down Expand Up @@ -560,7 +539,6 @@ def resize_annotation(

return new_annotation

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.normalize_annotation
def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict:
image_height, image_width = image_size
norm_annotation = {}
Expand All @@ -576,7 +554,6 @@ def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) ->
norm_annotation[key] = value
return norm_annotation

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._update_annotation_for_padded_image
def _update_annotation_for_padded_image(
self,
annotation: Dict,
Expand Down Expand Up @@ -612,7 +589,6 @@ def _update_annotation_for_padded_image(
new_annotation[key] = value
return new_annotation

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.pad
def pad(
self,
image: torch.Tensor,
Expand Down Expand Up @@ -644,7 +620,6 @@ def pad(
return image, pixel_mask, annotation

@functools.lru_cache(maxsize=1)
# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast._validate_input_arguments
def _validate_input_arguments(
self,
do_rescale: bool,
Expand Down Expand Up @@ -673,7 +648,6 @@ def _validate_input_arguments(
if do_normalize and None in (image_mean, image_std):
raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")

# Copied from transformers.models.detr.image_processing_detr_fast.DetrImageProcessorFast.preprocess
def preprocess(
self,
images: ImageInput,
Expand Down Expand Up @@ -874,7 +848,7 @@ def preprocess(
processed_annotations = []
pixel_masks = [] # Initialize pixel_masks here
for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
# prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
# prepare (COCO annotations as a list of Dict -> DeformableDetr target as a single Dict per image)
if annotations is not None:
annotation = self.prepare_annotation(
image,
Expand Down Expand Up @@ -950,7 +924,6 @@ def preprocess(
]
return encoded_inputs

# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process
def post_process(self, outputs, target_sizes):
"""
Converts the raw output of [`DeformableDetrForObjectDetection`] into final bounding boxes in (top_left_x,
Expand Down Expand Up @@ -996,7 +969,6 @@ def post_process(self, outputs, target_sizes):

return results

# Copied from transformers.models.deformable_detr.image_processing_deformable_detr.DeformableDetrImageProcessor.post_process_object_detection
def post_process_object_detection(
self, outputs, threshold: float = 0.5, target_sizes: Union[TensorType, List[Tuple]] = None, top_k: int = 100
):
Expand Down
Loading