diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
index a2004a8b55931e..70f1a339de706a 100644
--- a/src/transformers/image_processing_utils.py
+++ b/src/transformers/image_processing_utils.py
@@ -748,6 +748,44 @@ def get_size_dict(
     return size_dict
 
 
+def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+
+    This is done by calculating the effective and wasted resolution for each possible resolution.
+
+    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
+
+    Args:
+        original_size (tuple):
+            The original size of the image in the format (height, width).
+        possible_resolutions (list):
+            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+    Returns:
+        tuple: The best fit resolution in the format (height, width).
+    """
+    original_height, original_width = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+
+    for height, width in possible_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (height, width)
+
+    return best_fit
+
+
 ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
 if ImageProcessingMixin.push_to_hub.__doc__ is not None:
     ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index 582e43f59c74ed..3debf97fea2081 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -77,7 +77,7 @@
         ("layoutlmv3", "LayoutLMv3ImageProcessor"),
         ("levit", "LevitImageProcessor"),
         ("llava", "CLIPImageProcessor"),
-        ("llava_next", "CLIPImageProcessor"),
+        ("llava_next", "LlavaNextImageProcessor"),
         ("mask2former", "Mask2FormerImageProcessor"),
         ("maskformer", "MaskFormerImageProcessor"),
         ("mgp-str", "ViTImageProcessor"),
diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py
index 631fd1223c2944..3934927a2e7957 100644
--- a/src/transformers/models/llava_next/image_processing_llava_next.py
+++ b/src/transformers/models/llava_next/image_processing_llava_next.py
@@ -19,7 +19,7 @@
 
 import numpy as np
 
-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
 from ...image_transforms import (
     convert_to_rgb,
     get_resize_output_image_size,
@@ -51,44 +51,6 @@
     from PIL import Image
 
 
-def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-    """
-    Selects the best resolution from a list of possible resolutions based on the original size.
-
-    This is done by calculating the effective and wasted resolution for each possible resolution.
-
-    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
-
-    Args:
-        original_size (tuple):
-            The original size of the image in the format (height, width).
-        possible_resolutions (list):
-            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
-
-    Returns:
-        tuple: The best fit resolution in the format (height, width).
-    """
-    original_height, original_width = original_size
-    best_fit = None
-    max_effective_resolution = 0
-    min_wasted_resolution = float("inf")
-
-    for height, width in possible_resolutions:
-        scale = min(width / original_width, height / original_height)
-        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-        wasted_resolution = (width * height) - effective_resolution
-
-        if effective_resolution > max_effective_resolution or (
-            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
-        ):
-            max_effective_resolution = effective_resolution
-            min_wasted_resolution = wasted_resolution
-            best_fit = (height, width)
-
-    return best_fit
-
-
 def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
     """
     Divides an image into patches of a specified size.
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
index 10f82eb45ad007..845269830c533c 100644
--- a/src/transformers/models/llava_next/modeling_llava_next.py
+++ b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -24,6 +24,7 @@
 from ... import PreTrainedModel
 from ...activations import ACT2FN
 from ...cache_utils import Cache
+from ...image_processing_utils import select_best_resolution
 from ...modeling_outputs import ModelOutput
 from ...utils import (
     add_start_docstrings,
@@ -33,7 +34,6 @@
 )
 from ..auto import AutoModel, AutoModelForCausalLM
 from .configuration_llava_next import LlavaNextConfig
-from .image_processing_llava_next import select_best_resolution
 
 
 logger = logging.get_logger(__name__)
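With this change, select_best_resolution lives in transformers.image_processing_utils and is re-imported from there by image_processing_llava_next and modeling_llava_next. A minimal usage sketch; the image size and candidate grid below are illustrative values chosen for the example, not constants shipped with the library:

from transformers.image_processing_utils import select_best_resolution

# (height, width) of the input image -- illustrative only
original_size = (426, 640)

# candidate target resolutions (e.g. tiling grids) -- illustrative only
possible_resolutions = [(336, 672), (672, 336), (672, 672), (336, 1008)]

# Picks the candidate that keeps the most downscaled image pixels
# (effective resolution) while wasting the least canvas area.
best = select_best_resolution(original_size, possible_resolutions)
print(best)  # (672, 672) for these inputs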