From bf646fbf2d29ec78b96e32d46c020ccbfb2249dc Mon Sep 17 00:00:00 2001
From: Pavel Iakubovskii 
Date: Fri, 17 May 2024 16:21:26 +0100
Subject: [PATCH] Add fixed resize and pad strategy for object detection
 (#30742)

* Add resize and pad strategy

* Merge get_size functions

* Add pad_size + tests to object detection models

* Fixup

* Update docstrings

* Fixup

---
 src/transformers/image_processing_utils.py    |   8 +-
 .../image_processing_conditional_detr.py      | 123 ++++++++++++++---
 .../image_processing_deformable_detr.py       | 123 ++++++++++++++---
 .../models/deta/image_processing_deta.py      | 116 ++++++++++++++--
 .../models/detr/image_processing_detr.py      | 120 +++++++++++++++--
 .../image_processing_grounding_dino.py        | 124 +++++++++++++++---
 .../models/yolos/image_processing_yolos.py    | 123 ++++++++++++++---
 .../test_image_processing_conditional_detr.py |  47 +++++++
 .../test_image_processing_deformable_detr.py  |  47 +++++++
 .../models/deta/test_image_processing_deta.py |  47 +++++++
 .../models/detr/test_image_processing_detr.py |  46 +++++++
 .../test_image_processing_grounding_dino.py   |  47 +++++++
 .../yolos/test_image_processing_yolos.py      |  47 +++++++
 13 files changed, 929 insertions(+), 89 deletions(-)

diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index e040cdd31aa04b..efd1e04a62106f 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -662,7 +662,13 @@ def center_crop(
     )
 
 
-VALID_SIZE_DICT_KEYS = ({"height", "width"}, {"shortest_edge"}, {"shortest_edge", "longest_edge"}, {"longest_edge"})
+VALID_SIZE_DICT_KEYS = (
+    {"height", "width"},
+    {"shortest_edge"},
+    {"shortest_edge", "longest_edge"},
+    {"longest_edge"},
+    {"max_height", "max_width"},
+)
 
 
 def is_valid_size_dict(size_dict):
diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
index 9aab526c09a40d..46a96a76cf4153 100644
--- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
@@ -147,6 +147,42 @@ def get_resize_output_image_size(
     return get_size_with_aspect_ratio(image_size, size, max_size)
 
 
+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
+    ratio. Note that even if image_height < max_height and image_width < max_width, the image will be resized so
+    that at least one of the edges is equal to max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -768,8 +804,16 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -793,8 +837,13 @@ class ConditionalDetrImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. 
""" model_input_names = ["pixel_values", "pixel_mask"] @@ -813,6 +862,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -846,6 +896,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -861,6 +912,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -933,8 +985,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. data_format (`str` or `ChannelDimension`, *optional*): @@ -953,18 +1012,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1108,6 +1176,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1137,8 +1206,16 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. 
+ pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1146,7 +1223,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1160,7 +1237,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1195,6 +1272,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1222,7 +1300,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1240,8 +1326,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. 
return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1257,6 +1344,10 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1286,6 +1377,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1410,6 +1502,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py index e6587b7fb59a71..f1ce6797e8f798 100644 --- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py +++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py @@ -145,6 +145,42 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -766,8 +802,16 @@ class DeformableDetrImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. 
size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -791,8 +835,13 @@ class DeformableDetrImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -811,6 +860,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -844,6 +894,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -859,6 +910,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -931,8 +983,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. 
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. data_format (`str` or `ChannelDimension`, *optional*): @@ -951,18 +1010,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1106,6 +1174,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1135,8 +1204,16 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. 
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1144,7 +1221,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1158,7 +1235,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1193,6 +1270,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1220,7 +1298,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1238,8 +1324,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1255,6 +1342,10 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1284,6 +1375,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1408,6 +1500,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py index fd563d3009c883..a73eedba2c57eb 100644 --- a/src/transformers/models/deta/image_processing_deta.py +++ b/src/transformers/models/deta/image_processing_deta.py @@ -139,6 +139,42 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -475,8 +511,16 @@ class DetaImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. 
Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -500,8 +544,13 @@ class DetaImageProcessor(BaseImageProcessor): Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` - method. If `True` will pad the images in the batch to the largest height and width in the batch. - Padding will be applied to the bottom and right of the image with zeros. + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ model_input_names = ["pixel_values", "pixel_mask"] @@ -519,6 +568,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: bool = True, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -542,6 +592,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->DETA def prepare_annotation( @@ -593,7 +644,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - The desired output size. Can contain keys `shortest_edge` and `longest_edge` or `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. 
data_format (`ChannelDimension`, *optional*): @@ -605,18 +664,22 @@ def resize( """ size = get_size_dict(size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format + image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format ) return image @@ -760,6 +823,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -789,8 +853,16 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -798,7 +870,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -812,7 +884,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -846,6 +918,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -873,7 +946,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. 
Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -891,8 +972,9 @@ def preprocess( boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` and in relative coordinates. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -908,6 +990,10 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -929,6 +1015,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. 
@@ -1051,6 +1138,7 @@ def preprocess(
                 input_data_format=input_data_format,
                 return_tensors=return_tensors,
                 update_bboxes=do_convert_annotations,
+                pad_size=pad_size,
             )
         else:
             images = [
diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py
index be4b0dec9f0a5f..e6c2ee16a8570b 100644
--- a/src/transformers/models/detr/image_processing_detr.py
+++ b/src/transformers/models/detr/image_processing_detr.py
@@ -116,6 +116,41 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
     return (oh, ow)
 
 
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
+    ratio. Note that even if image_height < max_height and image_width < max_width, the image will be resized so
+    that at least one of the edges is equal to max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
 def get_resize_output_image_size(
     input_image: np.ndarray,
     size: Union[int, Tuple[int, int], List[int]],
@@ -753,7 +788,15 @@ class DetrImageProcessor(BaseImageProcessor):
         overridden by the `do_resize` parameter in the `preprocess` method.
     size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
         Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
-        in the `preprocess` method.
+        in the `preprocess` method. Available options are:
+            - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+              Do NOT keep the aspect ratio.
+            - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+              the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+              less or equal to `longest_edge`.
+            - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+              aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+              `max_width`.
     resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
         Resampling filter to use if resizing the image.
     do_rescale (`bool`, *optional*, defaults to `True`):
@@ -777,8 +820,13 @@ class DetrImageProcessor(BaseImageProcessor):
         Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
     do_pad (`bool`, *optional*, defaults to `True`):
         Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-        method. If `True` will pad the images in the batch to the largest height and width in the batch.
-        Padding will be applied to the bottom and right of the image with zeros.
+        method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+        If `pad_size` is provided, the image will be padded to the specified dimensions.
+        Otherwise, the image will be padded to the maximum height and width of the batch.
+    pad_size (`Dict[str, int]`, *optional*):
+        The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+        provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+        height and width in the batch.
     """
 
     model_input_names = ["pixel_values", "pixel_mask"]
@@ -796,6 +844,7 @@ def __init__(
         image_std: Union[float, List[float]] = None,
         do_convert_annotations: Optional[bool] = None,
         do_pad: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
@@ -829,6 +878,7 @@ def __init__(
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
+        self.pad_size = pad_size
         self._valid_processor_keys = [
             "images",
             "annotations",
@@ -844,6 +894,7 @@ def __init__(
             "image_mean",
             "image_std",
             "do_pad",
+            "pad_size",
             "format",
             "return_tensors",
             "data_format",
@@ -913,8 +964,15 @@ def resize(
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
-                `height` and `width`.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                      the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                      less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                      aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                      `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
@@ -933,18 +991,28 @@ def resize(
             max_size = None
         size = get_size_dict(size, max_size=max_size, default_to_square=False)
         if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(
+            new_size = get_resize_output_image_size(
                 image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
             )
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
         elif "height" in size and "width" in size:
-            size = (size["height"], size["width"])
+            new_size = (size["height"], size["width"])
         else:
             raise ValueError(
-                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                "Size must contain 'height' and 'width' keys, 'shortest_edge' and 'longest_edge' keys, or"
+                " 'max_height' and 'max_width' keys. Got"
                 f" {size.keys()}."
) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1083,6 +1150,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1112,8 +1180,16 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1121,7 +1197,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1135,7 +1211,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1169,6 +1245,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1196,7 +1273,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. 
do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1214,8 +1299,9 @@ def preprocess( image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): Standard deviation to use when normalizing the image. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1231,6 +1317,10 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1260,6 +1350,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format images = make_list_of_images(images) @@ -1384,6 +1475,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py index 0d7eebb7b3ea12..08a5a70bf43c2c 100644 --- a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py +++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py @@ -152,6 +152,42 @@ def get_resize_output_image_size( return get_size_with_aspect_ratio(image_size, size, max_size) +# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width +def get_image_size_for_max_height_width( + input_image: np.ndarray, + max_height: int, + max_width: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + """ + Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio. + Important, even if image_height < max_height and image_width < max_width, the image will be resized + to at least one of the edges be equal to max_height or max_width. + + For example: + - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50) + - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400) + + Args: + input_image (`np.ndarray`): + The image to resize. + max_height (`int`): + The maximum allowed height. + max_width (`int`): + The maximum allowed width. 
+ input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + height, width = image_size + height_scale = max_height / height + width_scale = max_width / width + min_scale = min(height_scale, width_scale) + new_height = int(height * min_scale) + new_width = int(width * min_scale) + return new_height, new_width + + # Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn def get_numpy_to_framework_fn(arr) -> Callable: """ @@ -773,8 +809,16 @@ class GroundingDinoImageProcessor(BaseImageProcessor): Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): - Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in - the `preprocess` method. + Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter + in the `preprocess` method. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`): Resampling filter to use if resizing the image. do_rescale (`bool`, *optional*, defaults to `True`): @@ -798,8 +842,14 @@ class GroundingDinoImageProcessor(BaseImageProcessor): bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`. Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method. do_pad (`bool`, *optional*, defaults to `True`): - Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be - overridden by the `do_pad` parameter in the `preprocess` method. + Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` + method. If `True`, padding will be applied to the bottom and right of the image with zeros. + If `pad_size` is provided, the image will be padded to the specified dimensions. + Otherwise, the image will be padded to the maximum height and width of the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. 
""" model_input_names = ["pixel_values", "pixel_mask"] @@ -818,6 +868,7 @@ def __init__( image_std: Union[float, List[float]] = None, do_convert_annotations: Optional[bool] = None, do_pad: bool = True, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> None: if "pad_and_return_pixel_mask" in kwargs: @@ -851,6 +902,7 @@ def __init__( self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD self.do_pad = do_pad + self.pad_size = pad_size self._valid_processor_keys = [ "images", "annotations", @@ -866,6 +918,7 @@ def __init__( "image_mean", "image_std", "do_pad", + "pad_size", "format", "return_tensors", "data_format", @@ -938,8 +991,15 @@ def resize( image (`np.ndarray`): Image to resize. size (`Dict[str, int]`): - Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or - `height` and `width`. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`): Resampling filter to use if resizing the image. data_format (`str` or `ChannelDimension`, *optional*): @@ -958,18 +1018,27 @@ def resize( max_size = None size = get_size_dict(size, max_size=max_size, default_to_square=False) if "shortest_edge" in size and "longest_edge" in size: - size = get_resize_output_image_size( + new_size = get_resize_output_image_size( image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format ) + elif "max_height" in size and "max_width" in size: + new_size = get_image_size_for_max_height_width( + image, size["max_height"], size["max_width"], input_data_format=input_data_format + ) elif "height" in size and "width" in size: - size = (size["height"], size["width"]) + new_size = (size["height"], size["width"]) else: raise ValueError( "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got" f" {size.keys()}." ) image = resize( - image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + image, + size=new_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, ) return image @@ -1113,6 +1182,7 @@ def pad( data_format: Optional[ChannelDimension] = None, input_data_format: Optional[Union[str, ChannelDimension]] = None, update_bboxes: bool = True, + pad_size: Optional[Dict[str, int]] = None, ) -> BatchFeature: """ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width @@ -1142,8 +1212,16 @@ def pad( Whether to update the bounding boxes in the annotations to match the padded images. If the bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` format, the bounding boxes will not be updated. 
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
-        pad_size = get_max_height_width(images, input_data_format=input_data_format)
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)

         annotation_list = annotations if annotations is not None else [None] * len(images)
         padded_images = []
@@ -1151,7 +1229,7 @@ def pad(
         for image, annotation in zip(images, annotation_list):
             padded_image, padded_annotation = self._pad_image(
                 image,
-                pad_size,
+                padded_size,
                 annotation,
                 constant_values=constant_values,
                 data_format=data_format,
@@ -1165,7 +1243,7 @@ def pad(

         if return_pixel_mask:
             masks = [
-                make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format)
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
                 for image in images
             ]
             data["pixel_mask"] = masks
@@ -1200,6 +1278,7 @@ def preprocess(
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> BatchFeature:
         """
@@ -1227,7 +1306,15 @@ def preprocess(
             do_resize (`bool`, *optional*, defaults to self.do_resize):
                 Whether to resize the image.
             size (`Dict[str, int]`, *optional*, defaults to self.size):
-                Size of the image after resizing.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to
+                      `shortest_edge` and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size
+                      respecting the aspect ratio and keeping the height less than or equal to `max_height` and
+                      the width less than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to self.resample):
                 Resampling filter to use when resizing the image.
             do_rescale (`bool`, *optional*, defaults to self.do_rescale):
@@ -1245,8 +1332,9 @@ def preprocess(
             image_std (`float` or `List[float]`, *optional*, defaults to self.image_std):
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
-                Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch
-                and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros.
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
             format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
@@ -1262,6 +1350,10 @@ def preprocess(
                 - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
                 - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
                 - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
         """
         if "pad_and_return_pixel_mask" in kwargs:
             logger.warning_once(
@@ -1291,6 +1383,7 @@ def preprocess(
             self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
         )
         do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
         format = self.format if format is None else format

         images = make_list_of_images(images)
@@ -1415,6 +1508,7 @@ def preprocess(
                 input_data_format=input_data_format,
                 update_bboxes=do_convert_annotations,
                 return_tensors=return_tensors,
+                pad_size=pad_size,
             )
         else:
             images = [
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index 1c35fc291226ce..669dab238e721c 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -133,6 +133,42 @@ def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, in
     return (height, width)


+# Copied from transformers.models.detr.image_processing_detr.get_image_size_for_max_height_width
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keeps the aspect
+    ratio. Note that even if `image_height < max_height` and `image_width < max_width`, the image is still resized
+    (upscaled) so that at least one of the edges is equal to `max_height` or `max_width`.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input
+            image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
 # Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
 def get_resize_output_image_size(
     input_image: np.ndarray,
@@ -678,8 +714,16 @@ class YolosImageProcessor(BaseImageProcessor):
             Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden
             by the `do_resize` parameter in the `preprocess` method.
         size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
-            Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in
-            the `preprocess` method.
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                  Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                  respecting the aspect ratio and keeping the shortest edge less than or equal to `shortest_edge`
+                  and the longest edge less than or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting
+                  the aspect ratio and keeping the height less than or equal to `max_height` and the width less
+                  than or equal to `max_width`.
         resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
             Resampling filter to use if resizing the image.
         do_rescale (`bool`, *optional*, defaults to `True`):
@@ -699,8 +743,13 @@ class YolosImageProcessor(BaseImageProcessor):
             for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
         do_pad (`bool`, *optional*, defaults to `True`):
             Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
-            method. If `True` will pad the images in the batch to the largest height and width in the batch.
-            Padding will be applied to the bottom and right of the image with zeros.
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`Dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
    """

    model_input_names = ["pixel_values", "pixel_mask"]

@@ -718,6 +767,7 @@ def __init__(
         image_std: Union[float, List[float]] = None,
         do_convert_annotations: Optional[bool] = None,
         do_pad: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
         **kwargs,
     ) -> None:
         if "pad_and_return_pixel_mask" in kwargs:
@@ -751,6 +801,7 @@ def __init__(
         self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
         self.do_pad = do_pad
+        self.pad_size = pad_size
         self._valid_processor_keys = [
             "images",
             "annotations",
@@ -766,6 +817,7 @@ def __init__(
             "image_std",
             "do_convert_annotations",
             "do_pad",
+            "pad_size",
             "format",
             "return_tensors",
             "data_format",
@@ -838,8 +890,15 @@ def resize(
             image (`np.ndarray`):
                 Image to resize.
             size (`Dict[str, int]`):
-                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
-                `height` and `width`.
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                      Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size
+                      respecting the aspect ratio and keeping the shortest edge less than or equal to
+                      `shortest_edge` and the longest edge less than or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size
+                      respecting the aspect ratio and keeping the height less than or equal to `max_height` and
+                      the width less than or equal to `max_width`.
             resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
                 Resampling filter to use if resizing the image.
             data_format (`str` or `ChannelDimension`, *optional*):
@@ -858,18 +917,27 @@ def resize(
             max_size = None
         size = get_size_dict(size, max_size=max_size, default_to_square=False)
         if "shortest_edge" in size and "longest_edge" in size:
-            size = get_resize_output_image_size(
+            new_size = get_resize_output_image_size(
                 image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
             )
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
         elif "height" in size and "width" in size:
-            size = (size["height"], size["width"])
+            new_size = (size["height"], size["width"])
         else:
             raise ValueError(
-                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
-                f" {size.keys()}."
+                "Size must contain 'height' and 'width' keys, 'shortest_edge' and 'longest_edge' keys, or"
+                f" 'max_height' and 'max_width' keys. Got {size.keys()}."
             )
         image = resize(
-            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+            image,
+            size=new_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
         )
         return image

@@ -1012,6 +1080,7 @@ def pad(
         data_format: Optional[ChannelDimension] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
         update_bboxes: bool = True,
+        pad_size: Optional[Dict[str, int]] = None,
     ) -> BatchFeature:
         """
         Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
@@ -1042,8 +1111,16 @@ def pad(
                 Whether to update the bounding boxes in the annotations to match the padded images. If the
                 bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width,
                 height)` format, the bounding boxes will not be updated.
+            pad_size (`Dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
""" - pad_size = get_max_height_width(images, input_data_format=input_data_format) + pad_size = pad_size if pad_size is not None else self.pad_size + if pad_size is not None: + padded_size = (pad_size["height"], pad_size["width"]) + else: + padded_size = get_max_height_width(images, input_data_format=input_data_format) annotation_list = annotations if annotations is not None else [None] * len(images) padded_images = [] @@ -1051,7 +1128,7 @@ def pad( for image, annotation in zip(images, annotation_list): padded_image, padded_annotation = self._pad_image( image, - pad_size, + padded_size, annotation, constant_values=constant_values, data_format=data_format, @@ -1065,7 +1142,7 @@ def pad( if return_pixel_mask: masks = [ - make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format) for image in images ] data["pixel_mask"] = masks @@ -1099,6 +1176,7 @@ def preprocess( return_tensors: Optional[Union[TensorType, str]] = None, data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_size: Optional[Dict[str, int]] = None, **kwargs, ) -> BatchFeature: """ @@ -1126,7 +1204,15 @@ def preprocess( do_resize (`bool`, *optional*, defaults to self.do_resize): Whether to resize the image. size (`Dict[str, int]`, *optional*, defaults to self.size): - Size of the image after resizing. + Size of the image's `(height, width)` dimensions after resizing. Available options are: + - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`. + Do NOT keep the aspect ratio. + - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting + the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge + less or equal to `longest_edge`. + - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the + aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to + `max_width`. resample (`PILImageResampling`, *optional*, defaults to self.resample): Resampling filter to use when resizing the image. do_rescale (`bool`, *optional*, defaults to self.do_rescale): @@ -1144,8 +1230,9 @@ def preprocess( boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` and in relative coordinates. do_pad (`bool`, *optional*, defaults to self.do_pad): - Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch - and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + Whether to pad the image. If `True`, padding will be applied to the bottom and right of + the image with zeros. If `pad_size` is provided, the image will be padded to the specified + dimensions. Otherwise, the image will be padded to the maximum height and width of the batch. format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): Format of the annotations. return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): @@ -1158,6 +1245,10 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. 
- `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size + provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest + height and width in the batch. """ if "pad_and_return_pixel_mask" in kwargs: logger.warning_once( @@ -1187,6 +1278,7 @@ def preprocess( self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations ) do_pad = self.do_pad if do_pad is None else do_pad + pad_size = self.pad_size if pad_size is None else pad_size format = self.format if format is None else format validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) @@ -1310,6 +1402,7 @@ def preprocess( input_data_format=input_data_format, update_bboxes=do_convert_annotations, return_tensors=return_tensors, + pad_size=pad_size, ) else: images = [ diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py index e340f4247d47df..61dcdc873dc39e 100644 --- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py +++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py @@ -490,3 +490,50 @@ def test_batched_coco_panoptic_annotations(self): ).T self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->ConditionalDetr + def test_max_width_max_height_resizing_and_pad_strategy(self): + image_1 = torch.ones([200, 100, 3], dtype=torch.uint8) + + # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 100, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50])) + + # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=False, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + + # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100} + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100])) + + # do_pad=True, max_height=300, max_width=100, image=200x100 -> 300x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 300, "max_width": 100}, + do_pad=True, + pad_size={"height": 301, "width": 101}, + ) + inputs = image_processor(images=[image_1], return_tensors="pt") + self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101])) + + ### Check for batch + image_2 = torch.ones([100, 150, 3], dtype=torch.uint8) + + # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100 + image_processor = ConditionalDetrImageProcessor( + size={"max_height": 150, "max_width": 100}, + do_pad=True, + pad_size={"height": 150, "width": 100}, 
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
index 50df72496ffc3e..49139c753938d0 100644
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -492,3 +492,51 @@ def test_batched_coco_panoptic_annotations(self):
         ).T
         self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
         self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->DeformableDetr
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 200x100, padded to 301x101
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = DeformableDetrImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py
index ad17f0b5a17809..3ea5885b0e0921 100644
--- a/tests/models/deta/test_image_processing_deta.py
+++ b/tests/models/deta/test_image_processing_deta.py
@@ -486,3 +486,51 @@ def test_batched_coco_panoptic_annotations(self):
         ).T
         self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
         self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Deta
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = DetaImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = DetaImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = DetaImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 200x100, padded to 301x101
+        image_processor = DetaImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = DetaImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py
index c79c1d7b01962a..ede06be6c521e9 100644
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -547,3 +547,50 @@ def test_batched_coco_panoptic_annotations(self):
         ).T
         self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
         self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = DetrImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = DetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = DetrImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 200x100, padded to 301x101
+        image_processor = DetrImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = DetrImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
index df69784bbb4523..5cd09ce23816b0 100644
--- a/tests/models/grounding_dino/test_image_processing_grounding_dino.py
+++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py
@@ -528,3 +528,51 @@ def test_batched_coco_panoptic_annotations(self):
         ).T
         self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
         self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->GroundingDino
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 200x100, padded to 301x101
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = GroundingDinoImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py
index f7465779b59461..f04015ac0c9b22 100644
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -546,3 +546,51 @@ def test_batched_coco_panoptic_annotations(self):
         ).T
         self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1))
         self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1))
+
+    # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_max_width_max_height_resizing_and_pad_strategy with Detr->Yolos
+    def test_max_width_max_height_resizing_and_pad_strategy(self):
+        image_1 = torch.ones([200, 100, 3], dtype=torch.uint8)
+
+        # do_pad=False, max_height=100, max_width=100, image=200x100 -> 100x50
+        image_processor = YolosImageProcessor(
+            size={"max_height": 100, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 50]))
+
+        # do_pad=False, max_height=300, max_width=100, image=200x100 -> 200x100
+        image_processor = YolosImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=False,
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 200, 100]))
+
+        # do_pad=True, max_height=100, max_width=100, image=200x100 -> 100x100
+        image_processor = YolosImageProcessor(
+            size={"max_height": 100, "max_width": 100}, do_pad=True, pad_size={"height": 100, "width": 100}
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 100, 100]))
+
+        # do_pad=True, max_height=300, max_width=100, image=200x100 -> 200x100, padded to 301x101
+        image_processor = YolosImageProcessor(
+            size={"max_height": 300, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 301, "width": 101},
+        )
+        inputs = image_processor(images=[image_1], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([1, 3, 301, 101]))
+
+        ### Check for batch
+        image_2 = torch.ones([100, 150, 3], dtype=torch.uint8)
+
+        # do_pad=True, max_height=150, max_width=100, images=[200x100, 100x150] -> 150x100
+        image_processor = YolosImageProcessor(
+            size={"max_height": 150, "max_width": 100},
+            do_pad=True,
+            pad_size={"height": 150, "width": 100},
+        )
+        inputs = image_processor(images=[image_1, image_2], return_tensors="pt")
+        self.assertEqual(inputs["pixel_values"].shape, torch.Size([2, 3, 150, 100]))
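
The `{"max_height": int, "max_width": int}` strategy above always scales by the more restrictive of the two ratios, so exactly one edge lands on its bound, even when the image is already smaller than both bounds. A minimal standalone sketch of that rule (the helper name `max_size_rule` is illustrative and not part of the patch):

from typing import Tuple


def max_size_rule(height: int, width: int, max_height: int, max_width: int) -> Tuple[int, int]:
    # Scale by the most restrictive ratio; int() truncates, matching
    # get_image_size_for_max_height_width above.
    min_scale = min(max_height / height, max_width / width)
    return int(height * min_scale), int(width * min_scale)


assert max_size_rule(100, 200, 50, 50) == (25, 50)      # downscale, width binds
assert max_size_rule(100, 200, 200, 500) == (200, 400)  # upscale, height binds
assert max_size_rule(200, 100, 300, 100) == (200, 100)  # width already at its bound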
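
An end-to-end usage sketch mirroring the DETR tests above (not part of the diff; DetrImageProcessor is used here, but the other five processors touched by this patch accept the same arguments):

import torch
from transformers import DetrImageProcessor

# A dummy 200x100 (height x width) image, as in the tests above.
image = torch.ones([200, 100, 3], dtype=torch.uint8)

# Resize so that height <= 100 and width <= 100 while keeping the aspect
# ratio, then pad every image to a fixed 100x100 canvas. With a fixed
# pad_size, the batch shape no longer depends on the largest image in
# the batch.
processor = DetrImageProcessor(
    size={"max_height": 100, "max_width": 100},
    do_pad=True,
    pad_size={"height": 100, "width": 100},
)
inputs = processor(images=[image], return_tensors="pt")
print(inputs["pixel_values"].shape)  # torch.Size([1, 3, 100, 100])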