From 42ecf48d9093c292131ae5f57e29d525c6790795 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 11:02:14 +0000 Subject: [PATCH 01/29] expand kwargs from align --- .../models/align/processing_align.py | 88 +++++++++++++++++-- 1 file changed, 82 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 0863c11310e318..ab8671c495fb4d 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -17,8 +17,11 @@ """ +from typing import Dict, List, Optional, Union + from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import PaddingStrategy, TensorType class AlignProcessor(ProcessorMixin): @@ -42,11 +45,49 @@ class AlignProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs): + def __call__( + self, + text=None, + images=None, + do_crop_margin: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: "PILImageResampling" = None, # noqa: F821 + do_thumbnail: bool = None, + do_align_long_axis: bool = None, + do_pad: bool = None, + do_rescale: bool = None, + rescale_factor: Union[int, float] = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 + input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = "max_length", + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = 64, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + ): """ Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` - and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode - the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to + arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` arguments to EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring of the above two methods for more information. 
@@ -87,11 +128,46 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64, if text is not None: encoding = self.tokenizer( - text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs + text, + text_pair=text_pair, + text_target=text_target, + text_pair_target=text_pair_target, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, ) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor( + images, + do_crop_margin=do_crop_margin, + do_resize=do_resize, + size=size, + resample=resample, + do_thumbnail=do_thumbnail, + do_align_long_axis=do_align_long_axis, + do_pad=do_pad, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + input_data_format=input_data_format, + ) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values From ccb214774e9e7653d7af540f0447286b17ea8e60 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 11:16:22 +0000 Subject: [PATCH 02/29] remove kwargs from altclip processor --- .../models/altclip/processing_altclip.py | 94 +++++++++++++++++-- 1 file changed, 85 insertions(+), 9 deletions(-) diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index e9b4f45269ca76..8388b9c416e79e 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -16,9 +16,11 @@ Image/Text processor class for AltCLIP """ import warnings +from typing import Dict, List, Optional, Union from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import PaddingStrategy, TensorType class AltCLIPProcessor(ProcessorMixin): @@ -34,22 +36,21 @@ class AltCLIPProcessor(ProcessorMixin): The image processor is a required input. tokenizer ([`XLMRobertaTokenizerFast`], *optional*): The tokenizer is a required input. + feature_extractor ([`CLIPFeatureExtractor`], *optional*): + The feature extractor is a deprecated input. 
""" attributes = ["image_processor", "tokenizer"] image_processor_class = "CLIPImageProcessor" tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast") - def __init__(self, image_processor=None, tokenizer=None, **kwargs): - feature_extractor = None - if "feature_extractor" in kwargs: + def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None): + if "feature_extractor": warnings.warn( "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" " instead.", FutureWarning, ) - feature_extractor = kwargs.pop("feature_extractor") - image_processor = image_processor if image_processor is not None else feature_extractor if image_processor is None: raise ValueError("You need to specify an `image_processor`.") @@ -58,7 +59,45 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): super().__init__(image_processor, tokenizer) - def __call__(self, text=None, images=None, return_tensors=None, **kwargs): + def __call__( + self, + text=None, + images=None, + do_crop_margin: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: "PILImageResampling" = None, # noqa: F821 + do_thumbnail: bool = None, + do_align_long_axis: bool = None, + do_pad: bool = None, + do_rescale: bool = None, + rescale_factor: Union[int, float] = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 + input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + ): """ Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not @@ -98,10 +137,47 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs): raise ValueError("You have to specify either text or images. 
Both cannot be none.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer( + text, + text_pair=text_pair, + text_target=text_target, + text_pair_target=text_pair_target, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + ) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor( + images, + do_crop_margin=do_crop_margin, + do_resize=do_resize, + size=size, + resample=resample, + do_thumbnail=do_thumbnail, + do_align_long_axis=do_align_long_axis, + do_pad=do_pad, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + input_data_format=input_data_format, + ) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values From f999e0c40f144621e0051a96c3a615ede062ec17 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 13:04:46 +0000 Subject: [PATCH 03/29] add explicit args for donut processor --- .../models/donut/image_processing_donut.py | 12 +++- .../models/donut/processing_donut.py | 62 +++++++++++++++---- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 2a1672e22041fb..6298a65d70aaf1 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -84,6 +84,13 @@ class DonutImageProcessor(BaseImageProcessor): channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): Image standard deviation. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. 
""" model_input_names = ["pixel_values"] @@ -101,7 +108,9 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - **kwargs, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs + ) -> None: super().__init__(**kwargs) @@ -122,6 +131,7 @@ def __init__( self.do_normalize = do_normalize self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.return_tensors = return_tensors def align_long_axis( self, diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 5636ecb9435cf3..91de6768f8b782 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -19,8 +19,12 @@ import warnings from contextlib import contextmanager -from ...processing_utils import ProcessorMixin +from typing import Dict, List, Optional, Union + +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy +from ...processing_utils import ProcessorMixin +from ...utils import PaddingStrategy, TensorType class DonutProcessor(ProcessorMixin): r""" @@ -36,21 +40,21 @@ class DonutProcessor(ProcessorMixin): An instance of [`DonutImageProcessor`]. The image processor is a required input. tokenizer ([`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`], *optional*): An instance of [`XLMRobertaTokenizer`/`XLMRobertaTokenizerFast`]. The tokenizer is a required input. + feature_extractor ([`CLIPFeatureExtractor`], *optional*): + The feature extractor is a deprecated input. """ attributes = ["image_processor", "tokenizer"] image_processor_class = "AutoImageProcessor" tokenizer_class = "AutoTokenizer" - def __init__(self, image_processor=None, tokenizer=None, **kwargs): - feature_extractor = None - if "feature_extractor" in kwargs: + def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None): + if "feature_extractor": warnings.warn( "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`" " instead.", FutureWarning, ) - feature_extractor = kwargs.pop("feature_extractor") image_processor = image_processor if image_processor is not None else feature_extractor if image_processor is None: @@ -62,22 +66,54 @@ def __init__(self, image_processor=None, tokenizer=None, **kwargs): self.current_processor = self.image_processor self._in_target_context_manager = False - def __call__(self, *args, **kwargs): + def __call__(self, + text=None, + images=None, + do_crop_margin: bool = None, + do_resize: bool = None, + size: Dict[str, int] = None, + resample: "PILImageResampling" = None, # noqa: F821 + do_thumbnail: bool = None, + do_align_long_axis: bool = None, + do_pad: bool = None, + do_rescale: bool = None, + rescale_factor: Union[int, float] = None, + do_normalize: bool = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 + input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + text_pair_target: Optional[ + Union[TextInput, 
PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + ): """ When used in normal mode, this method forwards all its arguments to AutoImageProcessor's [`~AutoImageProcessor.__call__`] and returns its output. If used in the context [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's - [`~DonutTokenizer.__call__`]. Please refer to the doctsring of the above two methods for more information. + [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information. """ # For backward compatibility if self._in_target_context_manager: - return self.current_processor(*args, **kwargs) + return self.current_processor(*args) - images = kwargs.pop("images", None) - text = kwargs.pop("text", None) - if len(args) > 0: - images = args[0] - args = args[1:] if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") From 8fb3a6bf0d6b5c498d4d36be0155988b9c2c9c0e Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 13:20:04 +0000 Subject: [PATCH 04/29] add explicit call to current processor for in context manager --- .../models/donut/processing_donut.py | 56 ++++++++++++++++++- 1 file changed, 53 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 91de6768f8b782..8c31fe88127aa4 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -112,16 +112,66 @@ def __call__(self, """ # For backward compatibility if self._in_target_context_manager: - return self.current_processor(*args) + return self.current_processor( + images, + do_crop_margin=do_crop_margin, + do_resize=do_resize, + size=size, + resample=resample, + do_thumbnail=do_thumbnail, + do_align_long_axis=do_align_long_axis, + do_pad=do_pad, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + input_data_format=input_data_format + ) if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") if images is not None: - inputs = self.image_processor(images, *args, **kwargs) + inputs = self.image_processor(images, + do_crop_margin=do_crop_margin, + do_resize=do_resize, + size=size, + resample=resample, + do_thumbnail=do_thumbnail, + do_align_long_axis=do_align_long_axis, + do_pad=do_pad, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + return_tensors=return_tensors, + data_format=data_format, + input_data_format=input_data_format,) if text is not None: - encodings = self.tokenizer(text, **kwargs) + encodings = 
self.tokenizer(text, + text_pair=text_pair, + text_target=text_target, + text_pair_target=text_pair_target, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose,) if text is None: return inputs From a90c766d002e074311a80ba06c26af75d3c3ff58 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 15:06:04 +0000 Subject: [PATCH 05/29] format --- .../models/donut/image_processing_donut.py | 3 +-- .../models/donut/processing_donut.py | 24 +++++++++++-------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/donut/image_processing_donut.py b/src/transformers/models/donut/image_processing_donut.py index 6298a65d70aaf1..7b24e3fe7e3b71 100644 --- a/src/transformers/models/donut/image_processing_donut.py +++ b/src/transformers/models/donut/image_processing_donut.py @@ -109,8 +109,7 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs - + **kwargs, ) -> None: super().__init__(**kwargs) diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 8c31fe88127aa4..ae2588b85e2040 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -18,7 +18,6 @@ import re import warnings from contextlib import contextmanager - from typing import Dict, List, Optional, Union from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy @@ -26,6 +25,7 @@ from ...processing_utils import ProcessorMixin from ...utils import PaddingStrategy, TensorType + class DonutProcessor(ProcessorMixin): r""" Constructs a Donut processor which wraps a Donut image processor and an XLMRoBERTa tokenizer into a single @@ -66,7 +66,8 @@ def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None) self.current_processor = self.image_processor self._in_target_context_manager = False - def __call__(self, + def __call__( + self, text=None, images=None, do_crop_margin: bool = None, @@ -103,7 +104,7 @@ def __call__(self, return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - ): + ): """ When used in normal mode, this method forwards all its arguments to AutoImageProcessor's [`~AutoImageProcessor.__call__`] and returns its output. 
If used in the context @@ -113,7 +114,7 @@ def __call__(self, # For backward compatibility if self._in_target_context_manager: return self.current_processor( - images, + images, do_crop_margin=do_crop_margin, do_resize=do_resize, size=size, @@ -128,15 +129,15 @@ def __call__(self, image_std=image_std, return_tensors=return_tensors, data_format=data_format, - input_data_format=input_data_format + input_data_format=input_data_format, ) - if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") if images is not None: - inputs = self.image_processor(images, + inputs = self.image_processor( + images, do_crop_margin=do_crop_margin, do_resize=do_resize, size=size, @@ -151,9 +152,11 @@ def __call__(self, image_std=image_std, return_tensors=return_tensors, data_format=data_format, - input_data_format=input_data_format,) + input_data_format=input_data_format, + ) if text is not None: - encodings = self.tokenizer(text, + encodings = self.tokenizer( + text, text_pair=text_pair, text_target=text_target, text_pair_target=text_pair_target, @@ -171,7 +174,8 @@ def __call__(self, return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, - verbose=verbose,) + verbose=verbose, + ) if text is None: return inputs From 49cb6cc44f5102b8f59889b2b9f808c3488e6639 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 15:07:27 +0000 Subject: [PATCH 06/29] remove unused kwargs --- src/transformers/models/bit/image_processing_bit.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index 7aa49145ae0527..6676cd4d37c822 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -188,7 +188,6 @@ def preprocess( return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. 
From 3ac1c7ed66c62c82a501d7c1f13b3aa9db69af1e Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 15:40:47 +0000 Subject: [PATCH 07/29] move conditions for encodings --- .../models/blip/processing_blip.py | 26 +++---------------- .../models/blip_2/processing_blip_2.py | 26 +++---------------- 2 files changed, 8 insertions(+), 44 deletions(-) diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 3b9d5c369a4412..8919bf2aefcdcd 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -79,6 +79,10 @@ def __call__( # Get only text if images is None: self.current_processor = self.tokenizer + if text is None: + text_encoding = None + + if images is None or text is not None: text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, @@ -102,28 +106,6 @@ def __call__( # add pixel_values encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) - if text is not None: - text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) - else: - text_encoding = None - if text_encoding is not None: encoding_image_processor.update(text_encoding) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index ff7044c82aedb6..f1df2229d9c8b5 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -81,6 +81,10 @@ def __call__( # Get only text if images is None: self.current_processor = self.tokenizer + if text is None: + text_encoding = None + + if images is None or text is not None: text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, @@ -104,28 +108,6 @@ def __call__( # add pixel_values encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) - if text is not None: - text_encoding = self.tokenizer( - text=text, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - pad_to_multiple_of=pad_to_multiple_of, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_token_type_ids=return_token_type_ids, - return_length=return_length, - verbose=verbose, - return_tensors=return_tensors, - **kwargs, - ) - else: - text_encoding = None - if text_encoding is not None: encoding_image_processor.update(text_encoding) From 7a819fd77e3b2f860ee7fadc3519eae88ec37bf2 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 16:00:59 +0000 Subject: [PATCH 08/29] improve flow over text/image --- .../models/blip/processing_blip.py | 26 +++++++++---------- .../models/blip_2/processing_blip_2.py | 26 +++++++++---------- 2 files changed, 24 insertions(+), 28 deletions(-) diff --git 
a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 8919bf2aefcdcd..713d1b8c194b82 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -76,13 +76,10 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify either images or text.") - # Get only text - if images is None: - self.current_processor = self.tokenizer - if text is None: - text_encoding = None + text_encoding = None - if images is None or text is not None: + if text is not None: + self.current_processor = self.tokenizer text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, @@ -101,15 +98,16 @@ def __call__( return_tensors=return_tensors, **kwargs, ) - return text_encoding - - # add pixel_values - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) - - if text_encoding is not None: - encoding_image_processor.update(text_encoding) - return encoding_image_processor + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. + if images is not None: + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + return encoding_image_processor + + return text_encoding def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index f1df2229d9c8b5..54a4bdb3bc982f 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -78,13 +78,10 @@ def __call__( if images is None and text is None: raise ValueError("You have to specify either images or text.") - # Get only text - if images is None: + text_encoding = None + + if text is not None: self.current_processor = self.tokenizer - if text is None: - text_encoding = None - - if images is None or text is not None: text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, @@ -103,15 +100,16 @@ def __call__( return_tensors=return_tensors, **kwargs, ) - return text_encoding - - # add pixel_values - encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) - - if text_encoding is not None: - encoding_image_processor.update(text_encoding) - return encoding_image_processor + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. 
+ if images is not None: + encoding_image_processor = self.image_processor(images, return_tensors=return_tensors) + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + return encoding_image_processor + + return text_encoding # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer def batch_decode(self, *args, **kwargs): From 9cc38b78c41521afa4df47a325bcf284256298ef Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Jan 2024 17:37:23 +0000 Subject: [PATCH 09/29] [breaking] pass explicit args to bridgetower --- .../image_processing_bridgetower.py | 12 +++--- .../bridgetower/processing_bridgetower.py | 39 +++++++++++++++---- 2 files changed, 39 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 1e2b8ea40b0703..0ffdf27e5931cf 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -161,6 +161,9 @@ class BridgeTowerImageProcessor(BaseImageProcessor): do_pad (`bool`, *optional*, defaults to `True`): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by the `do_pad` parameter in the `preprocess` method. + pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): + Deprecated. Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + Sets do_pad. """ model_input_names = ["pixel_values"] @@ -178,12 +181,12 @@ def __init__( image_std: Optional[Union[float, List[float]]] = None, do_center_crop: bool = True, do_pad: bool = True, - **kwargs, + pad_and_return_pixel_mask: bool = None, + max_text_len: int = 512 # TODO (Molbap): find original values in model config ) -> None: - if "pad_and_return_pixel_mask" in kwargs: - do_pad = kwargs.pop("pad_and_return_pixel_mask") + if pad_and_return_pixel_mask: + do_pad = pad_and_return_pixel_mask - super().__init__(**kwargs) size = size if size is not None else {"shortest_edge": 288} size = get_size_dict(size, default_to_square=False) @@ -381,7 +384,6 @@ def preprocess( return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, - **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 7718c3bf833fec..e73a64db98d26e 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -16,7 +16,7 @@ Processor class for BridgeTower. 
""" -from typing import List, Optional, Union +from typing import List, Optional, Union, Dict from ...processing_utils import ProcessorMixin from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy @@ -48,13 +48,27 @@ def __init__(self, image_processor, tokenizer): def __call__( self, - images, - text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + images=None, + text=None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + size_divisor: Optional[int] = None, + resample: "PILImageResampling" = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + pad_and_return_pixel_mask: Optional[bool] = None, + do_center_crop: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = None, max_length: Optional[int] = None, stride: int = 0, + is_split_into_words: bool = False, pad_to_multiple_of: Optional[int] = None, return_token_type_ids: Optional[bool] = None, return_attention_mask: Optional[bool] = None, @@ -63,8 +77,6 @@ def __call__( return_offsets_mapping: bool = False, return_length: bool = False, verbose: bool = True, - return_tensors: Optional[Union[str, TensorType]] = None, - **kwargs, ) -> BatchEncoding: """ This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and @@ -79,6 +91,7 @@ def __call__( truncation=truncation, max_length=max_length, stride=stride, + is_split_into_words=is_split_into_words, pad_to_multiple_of=pad_to_multiple_of, return_token_type_ids=return_token_type_ids, return_attention_mask=return_attention_mask, @@ -88,11 +101,23 @@ def __call__( return_length=return_length, verbose=verbose, return_tensors=return_tensors, - **kwargs, ) # add pixel_values + pixel_mask + encoding_image_processor = self.image_processor( - images, return_tensors=return_tensors, do_normalize=True, do_center_crop=True, **kwargs + images, + do_resize=do_resize, + size=size, + size_divisor=size_divisor, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + do_pad=do_pad, + pad_and_return_pixel_mask=pad_and_return_pixel_mask ) encoding.update(encoding_image_processor) From 7db64a06cdca3e6507d5c00e2dd444788fb9fc8c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 13:23:06 +0000 Subject: [PATCH 10/29] add default kwargs for BC --- .../models/bridgetower/image_processing_bridgetower.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 0ffdf27e5931cf..75ffe4a0f8b498 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -182,7 +182,12 @@ def __init__( do_center_crop: bool = True, do_pad: bool = True, pad_and_return_pixel_mask: bool = None, - max_text_len: int = 512 # TODO (Molbap): find original values in model config + max_text_len: int = 50, # 
From here on, kwargs are kept for backwards compatibility to load existing processors. + cache_dir: str = "/tmp", + downstream_fusion: bool = False, + downstream_fusion_layers: int = 1, + downstream_fusion_method: str = "elmo", + ) -> None: if pad_and_return_pixel_mask: do_pad = pad_and_return_pixel_mask @@ -384,6 +389,7 @@ def preprocess( return_tensors: Optional[Union[str, TensorType]] = None, data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + pad_and_return_pixel_mask: Optional[bool] = None, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. From 41674d9cb037df747995c8b9a6c905a4f7d97df7 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 17:43:19 +0000 Subject: [PATCH 11/29] fix bridgetower --- .../image_processing_bridgetower.py | 44 ++++++++++++++----- .../bridgetower/processing_bridgetower.py | 20 ++++++--- 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 75ffe4a0f8b498..ac5ed176a9b98d 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -180,15 +180,32 @@ def __init__( image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, do_center_crop: bool = True, - do_pad: bool = True, - pad_and_return_pixel_mask: bool = None, - max_text_len: int = 50, # From here on, kwargs are kept for backwards compatibility to load existing processors. - cache_dir: str = "/tmp", - downstream_fusion: bool = False, - downstream_fusion_layers: int = 1, - downstream_fusion_method: str = "elmo", - + do_pad: Optional[bool] = True, + pad_and_return_pixel_mask: Optional[bool] = None, + return_tensors: Optional[bool] = None, + **kwargs, ) -> None: + valid_processor_keys = { + "do_resize", + "size", + "size_divisor", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_center_crop", + "do_pad", + "pad_and_return_pixel_mask", + "return_tensors" + } + + unused_keys = set(kwargs.keys()) - valid_processor_keys + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.warning_once(f"Unused or unrecognized configuration parameters: {unused_key_str}.") + if pad_and_return_pixel_mask: do_pad = pad_and_return_pixel_mask @@ -206,6 +223,7 @@ def __init__( self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD self.do_pad = do_pad self.do_center_crop = do_center_crop + self.return_tensors = return_tensors # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize def resize( @@ -371,7 +389,7 @@ def pad( data["pixel_mask"] = masks return BatchFeature(data=data, tensor_type=return_tensors) - + def preprocess( self, images: ImageInput, @@ -443,6 +461,9 @@ def preprocess( - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + pad_and_return_pixel_mask (`bool`, *optional*, deprecated, defaults to `self.do_pad`): + Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also + created and returned. Deprecated version of do_pad. 
""" do_resize = do_resize if do_resize is not None else self.do_resize size_divisor = size_divisor if size_divisor is not None else self.size_divisor @@ -452,8 +473,9 @@ def preprocess( do_normalize = do_normalize if do_normalize is not None else self.do_normalize image_mean = image_mean if image_mean is not None else self.image_mean image_std = image_std if image_std is not None else self.image_std - do_pad = do_pad if do_pad is not None else self.do_pad - do_center_crop if do_center_crop is not None else self.do_center_crop + do_pad = do_pad if (do_pad is not None or pad_and_return_pixel_mask) else self.do_pad + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + return_tensors = return_tensors if return_tensors is not None else self.return_tensors size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index e73a64db98d26e..9062b80f578880 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -16,10 +16,11 @@ Processor class for BridgeTower. """ -from typing import List, Optional, Union, Dict +from typing import Dict, List, Optional, Union +from ...image_utils import ChannelDimension from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy, TextInput, PreTokenizedInput from ...utils import TensorType @@ -48,8 +49,8 @@ def __init__(self, image_processor, tokenizer): def __call__( self, - images=None, - text=None, + images, + text:Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]=None, do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, size_divisor: Optional[int] = None, @@ -63,6 +64,8 @@ def __call__( pad_and_return_pixel_mask: Optional[bool] = None, do_center_crop: Optional[bool] = None, return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, add_special_tokens: bool = True, padding: Union[bool, str, PaddingStrategy] = False, truncation: Union[bool, str, TruncationStrategy] = None, @@ -99,11 +102,11 @@ def __call__( return_special_tokens_mask=return_special_tokens_mask, return_offsets_mapping=return_offsets_mapping, return_length=return_length, - verbose=verbose, return_tensors=return_tensors, + verbose=verbose, ) # add pixel_values + pixel_mask - + print(size) encoding_image_processor = self.image_processor( images, do_resize=do_resize, @@ -116,8 +119,11 @@ def __call__( image_mean=image_mean, image_std=image_std, do_center_crop=do_center_crop, + data_format=data_format, + input_data_format=input_data_format, do_pad=do_pad, - pad_and_return_pixel_mask=pad_and_return_pixel_mask + pad_and_return_pixel_mask=pad_and_return_pixel_mask, + return_tensors=return_tensors, ) encoding.update(encoding_image_processor) From 618a687ff568ba7f9292bd54316f2b422a70bc9c Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 17:47:36 +0000 Subject: [PATCH 12/29] debug bridgetower image proc --- tests/models/bridgetower/test_image_processing_bridgetower.py | 1 + 1 file changed, 1 insertion(+) diff --git 
a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index f8837fdc964a76..5665248d77af0e 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -143,6 +143,7 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): + print(self.image_processor_dict) image_processing = self.image_processing_class(**self.image_processor_dict) self.assertTrue(hasattr(image_processing, "image_mean")) self.assertTrue(hasattr(image_processing, "image_std")) From f39cdc1b76b1ae10d70ffc009dbec09ff0f4d6ef Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 17:48:11 +0000 Subject: [PATCH 13/29] format --- src/transformers/models/blip/processing_blip.py | 2 +- src/transformers/models/blip_2/processing_blip_2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 713d1b8c194b82..605a11a154d391 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -106,7 +106,7 @@ def __call__( if text_encoding is not None: encoding_image_processor.update(text_encoding) return encoding_image_processor - + return text_encoding def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 54a4bdb3bc982f..0b11db42e53095 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -108,7 +108,7 @@ def __call__( if text_encoding is not None: encoding_image_processor.update(text_encoding) return encoding_image_processor - + return text_encoding # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer From 9a6f97d8b27cba43f2ece9722f58da5cb405f6f6 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 17:58:32 +0000 Subject: [PATCH 14/29] move kwargs message to info level --- .../models/bridgetower/image_processing_bridgetower.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index ac5ed176a9b98d..4510c9b852700b 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -204,7 +204,7 @@ def __init__( unused_keys = set(kwargs.keys()) - valid_processor_keys if unused_keys: unused_key_str = ", ".join(unused_keys) - logger.warning_once(f"Unused or unrecognized configuration parameters: {unused_key_str}.") + logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") if pad_and_return_pixel_mask: do_pad = pad_and_return_pixel_mask From 380f82f515da9637374788d8bf2c62a3e34b0d54 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 18:14:32 +0000 Subject: [PATCH 15/29] add debug messages --- .../image_processing_bridgetower.py | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 
4510c9b852700b..1d083d61ebaae4 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -171,7 +171,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor): def __init__( self, do_resize: bool = True, - size: Dict[str, int] = 288, + size: Dict[str, int] = {"shortest_edge": 288}, size_divisor: int = 32, resample: PILImageResampling = PILImageResampling.BICUBIC, do_rescale: bool = True, @@ -183,8 +183,12 @@ def __init__( do_pad: Optional[bool] = True, pad_and_return_pixel_mask: Optional[bool] = None, return_tensors: Optional[bool] = None, + data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 + input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 **kwargs, ) -> None: + print('DEBUG', kwargs) + valid_processor_keys = { "do_resize", "size", @@ -224,6 +228,8 @@ def __init__( self.do_pad = do_pad self.do_center_crop = do_center_crop self.return_tensors = return_tensors + self.data_format = data_format + self.input_data_format = input_data_format # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize def resize( @@ -476,10 +482,31 @@ def preprocess( do_pad = do_pad if (do_pad is not None or pad_and_return_pixel_mask) else self.do_pad do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop return_tensors = return_tensors if return_tensors is not None else self.return_tensors - + input_data_format = input_data_format if input_data_format is not None else self.input_data_format + data_format = data_format if data_format is not None else self.data_format + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + print('DEBUG') + print(f'images: {images}') + print(f'do_resize: {do_resize}') + print(f'size: {size}') + print(f'size_divisor: {size_divisor}') + print(f'resample: {resample}') + print(f'do_rescale: {do_rescale}') + print(f'rescale_factor: {rescale_factor}') + print(f'do_normalize: {do_normalize}') + print(f'image_mean: {image_mean}') + print(f'image_std: {image_std}') + print(f'do_pad: {do_pad}') + print(f'do_center_crop: {do_center_crop}') + print(f'return_tensors: {return_tensors}') + print(f'data_format: {data_format}') + print(f'input_data_format: {input_data_format}') + print(f'pad_and_return_pixel_mask: {pad_and_return_pixel_mask}') + if not is_batched(images): images = [images] From 75f15d37ef1c7fe5f70d1bf5c9ea3cd279ec7fee Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 26 Jan 2024 18:38:17 +0000 Subject: [PATCH 16/29] fix arguments not being passed in bridgetower --- .../image_processing_bridgetower.py | 21 +------------------ .../test_image_processing_bridgetower.py | 2 +- 2 files changed, 2 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 1d083d61ebaae4..6033eab37ba6a3 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -179,7 +179,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_center_crop: bool = True, + do_center_crop: Optional[bool] = None, do_pad: Optional[bool] = True, pad_and_return_pixel_mask: Optional[bool] = None, return_tensors: Optional[bool] = None, 
@@ -187,7 +187,6 @@ def __init__( input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 **kwargs, ) -> None: - print('DEBUG', kwargs) valid_processor_keys = { "do_resize", @@ -489,24 +488,6 @@ def preprocess( size = get_size_dict(size, default_to_square=False) - print('DEBUG') - print(f'images: {images}') - print(f'do_resize: {do_resize}') - print(f'size: {size}') - print(f'size_divisor: {size_divisor}') - print(f'resample: {resample}') - print(f'do_rescale: {do_rescale}') - print(f'rescale_factor: {rescale_factor}') - print(f'do_normalize: {do_normalize}') - print(f'image_mean: {image_mean}') - print(f'image_std: {image_std}') - print(f'do_pad: {do_pad}') - print(f'do_center_crop: {do_center_crop}') - print(f'return_tensors: {return_tensors}') - print(f'data_format: {data_format}') - print(f'input_data_format: {input_data_format}') - print(f'pad_and_return_pixel_mask: {pad_and_return_pixel_mask}') - if not is_batched(images): images = [images] diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index 5665248d77af0e..fee9041447fddd 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -39,7 +39,7 @@ def __init__( do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, - do_center_crop: bool = True, + do_center_crop: bool = False, image_mean: Optional[Union[float, List[float]]] = [0.48145466, 0.4578275, 0.40821073], image_std: Optional[Union[float, List[float]]] = [0.26862954, 0.26130258, 0.27577711], do_pad: bool = True, From 3df5faad79e975e7e7695c9a9399677b3183ad4f Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 1 Feb 2024 17:03:04 +0000 Subject: [PATCH 17/29] keep backwards compat for processing + modify testing args dict --- .../models/bridgetower/image_processing_bridgetower.py | 2 +- tests/models/bridgetower/test_image_processing_bridgetower.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 6033eab37ba6a3..38f8ab6b2e13c9 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -179,7 +179,7 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_center_crop: Optional[bool] = None, + do_center_crop: Optional[bool] = True, do_pad: Optional[bool] = True, pad_and_return_pixel_mask: Optional[bool] = None, return_tensors: Optional[bool] = None, diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index fee9041447fddd..8a8a94c8ea6b69 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -70,6 +70,7 @@ def prepare_image_processor_dict(self): "image_std": self.image_std, "do_normalize": self.do_normalize, "do_resize": self.do_resize, + "do_center_crop": self.do_center_crop, "size": self.size, "size_divisor": self.size_divisor, } From 69e5a2d58f86fa7814e60f64263384f5c0edb0d9 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 1 Feb 2024 17:31:53 +0000 Subject: [PATCH 18/29] fix quality --- 
.../image_processing_bridgetower.py | 29 +++++++++++++++---- .../bridgetower/processing_bridgetower.py | 8 ++--- .../test_image_processing_bridgetower.py | 1 - 3 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 38f8ab6b2e13c9..dce25aab84fa28 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -128,7 +128,7 @@ class BridgeTowerImageProcessor(BaseImageProcessor): do_resize (`bool`, *optional*, defaults to `True`): Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the `do_resize` parameter in the `preprocess` method. - size (`Dict[str, int]` *optional*, defaults to 288): + size (`Dict[str, int]` *optional*, defaults to `{'shortest_edge': 288}`): Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method. @@ -161,9 +161,27 @@ class BridgeTowerImageProcessor(BaseImageProcessor): do_pad (`bool`, *optional*, defaults to `True`): Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by the `do_pad` parameter in the `preprocess` method. - pad_and_return_pixel_mask (`bool`, *optional*, defaults to `True`): + pad_and_return_pixel_mask (`bool`, *optional*): Deprecated. Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Sets do_pad. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `"channels_first"`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
""" model_input_names = ["pixel_values"] @@ -187,7 +205,6 @@ def __init__( input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 **kwargs, ) -> None: - valid_processor_keys = { "do_resize", "size", @@ -201,7 +218,7 @@ def __init__( "do_center_crop", "do_pad", "pad_and_return_pixel_mask", - "return_tensors" + "return_tensors", } unused_keys = set(kwargs.keys()) - valid_processor_keys @@ -394,7 +411,7 @@ def pad( data["pixel_mask"] = masks return BatchFeature(data=data, tensor_type=return_tensors) - + def preprocess( self, images: ImageInput, @@ -483,7 +500,7 @@ def preprocess( return_tensors = return_tensors if return_tensors is not None else self.return_tensors input_data_format = input_data_format if input_data_format is not None else self.input_data_format data_format = data_format if data_format is not None else self.data_format - + size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 9062b80f578880..681af3f5cb4db0 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -18,9 +18,9 @@ from typing import Dict, List, Optional, Union -from ...image_utils import ChannelDimension +from ...image_utils import ChannelDimension, PILImageResampling from ...processing_utils import ProcessorMixin -from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, TruncationStrategy, TextInput, PreTokenizedInput +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy from ...utils import TensorType @@ -50,11 +50,11 @@ def __init__(self, image_processor, tokenizer): def __call__( self, images, - text:Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]=None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, size_divisor: Optional[int] = None, - resample: "PILImageResampling" = None, + resample: PILImageResampling = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[float] = None, do_normalize: Optional[bool] = None, diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index 8a8a94c8ea6b69..78b96ff3f75753 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -144,7 +144,6 @@ def image_processor_dict(self): return self.image_processor_tester.prepare_image_processor_dict() def test_image_processor_properties(self): - print(self.image_processor_dict) image_processing = self.image_processing_class(**self.image_processor_dict) self.assertTrue(hasattr(image_processing, "image_mean")) self.assertTrue(hasattr(image_processing, "image_std")) From 68c2f40f6a885f05924c544834ba8cd9e67415b6 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 1 Feb 2024 18:18:56 +0000 Subject: [PATCH 19/29] log kwargs mismatch to info level --- .../image_processing_bridgetower.py | 49 +++++++++++-------- .../image_processing_chinese_clip.py | 24 +++++++++ 2 files changed, 52 insertions(+), 21 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py 
b/src/transformers/models/bridgetower/image_processing_bridgetower.py index dce25aab84fa28..e639f80a79efec 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -205,27 +205,7 @@ def __init__( input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 **kwargs, ) -> None: - valid_processor_keys = { - "do_resize", - "size", - "size_divisor", - "resample", - "do_rescale", - "rescale_factor", - "do_normalize", - "image_mean", - "image_std", - "do_center_crop", - "do_pad", - "pad_and_return_pixel_mask", - "return_tensors", - } - - unused_keys = set(kwargs.keys()) - valid_processor_keys - if unused_keys: - unused_key_str = ", ".join(unused_keys) - logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") - + if pad_and_return_pixel_mask: do_pad = pad_and_return_pixel_mask @@ -430,6 +410,7 @@ def preprocess( data_format: ChannelDimension = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, pad_and_return_pixel_mask: Optional[bool] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. @@ -487,6 +468,32 @@ def preprocess( Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also created and returned. Deprecated version of do_pad. """ + valid_processor_keys = { + "images", + "do_resize", + "size", + "size_divisor", + "resample", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_pad", + "do_center_crop", + "return_tensors", + "data_format", + "input_data_format", + "pad_and_return_pixel_mask", + } + + unused_keys = set(kwargs.keys()) - valid_processor_keys + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") + + + do_resize = do_resize if do_resize is not None else self.do_resize size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py index 4f1048a45e6ac6..c49027d4bd3e10 100644 --- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -230,6 +230,30 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. 
""" + valid_processor_keys = { + "images", + "do_resize", + "size", + "resample", + "do_center_crop", + "crop_size", + "do_rescale", + "rescale_factor", + "do_normalize", + "image_mean", + "image_std", + "do_convert_rgb", + "return_tensors", + "data_format", + "input_data_format", + } + + unused_keys = set(kwargs.keys()) - valid_processor_keys + if unused_keys: + unused_key_str = ", ".join(unused_keys) + logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") + + do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) From e1e408421aecca34f4bb2b00635eb712b034d422 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 1 Feb 2024 18:24:21 +0000 Subject: [PATCH 20/29] fix quality --- .../models/bridgetower/image_processing_bridgetower.py | 3 --- .../models/chinese_clip/image_processing_chinese_clip.py | 1 - 2 files changed, 4 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index e639f80a79efec..ee53893e06b3e6 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -205,7 +205,6 @@ def __init__( input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 **kwargs, ) -> None: - if pad_and_return_pixel_mask: do_pad = pad_and_return_pixel_mask @@ -492,8 +491,6 @@ def preprocess( unused_key_str = ", ".join(unused_keys) logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") - - do_resize = do_resize if do_resize is not None else self.do_resize size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py index c49027d4bd3e10..fa62db7139de87 100644 --- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -253,7 +253,6 @@ def preprocess( unused_key_str = ", ".join(unused_keys) logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") - do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size size = get_size_dict(size, default_to_square=False) From 4b557b0ac30da058b38f7e97f328a9eb6fc60c82 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 15 Feb 2024 15:04:01 +0000 Subject: [PATCH 21/29] address comments --- src/transformers/models/blip/processing_blip.py | 1 - .../models/bridgetower/image_processing_bridgetower.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/blip/processing_blip.py b/src/transformers/models/blip/processing_blip.py index 605a11a154d391..3a9f9332afae25 100644 --- a/src/transformers/models/blip/processing_blip.py +++ b/src/transformers/models/blip/processing_blip.py @@ -79,7 +79,6 @@ def __call__( text_encoding = None if text is not None: - self.current_processor = self.tokenizer text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 
26d17f5045faf8..996d856de35336 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -197,8 +197,8 @@ def __init__( do_normalize: bool = True, image_mean: Optional[Union[float, List[float]]] = None, image_std: Optional[Union[float, List[float]]] = None, - do_center_crop: Optional[bool] = True, - do_pad: Optional[bool] = True, + do_center_crop: bool = True, + do_pad: bool = True, pad_and_return_pixel_mask: Optional[bool] = None, return_tensors: Optional[bool] = None, data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 From b7fc377f823749dda234b33a5f384da8be3cc8ba Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 15 Feb 2024 15:58:19 +0000 Subject: [PATCH 22/29] fix typo --- utils/check_docstrings.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index 7c895163d95988..32d57b2976289c 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -827,7 +827,7 @@ def find_indent(line: str) -> int: def stringify_default(default: Any) -> str: """ Returns the string representation of a default value, as used in docstring: numbers are left as is, all other - objects are in backtiks. + objects are in backticks. Args: default (`Any`): The default value to process @@ -857,7 +857,7 @@ def stringify_default(default: Any) -> str: def eval_math_expression(expression: str) -> Optional[Union[float, int]]: # Mainly taken from the excellent https://stackoverflow.com/a/9558001 """ - Evaluate (safely) a mathematial expression and returns its value. + Evaluate (safely) a mathematical expression and returns its value. Args: expression (`str`): The expression to evaluate. 
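The BridgeTower and Chinese-CLIP changes above converge on one pattern: collect leftover **kwargs in preprocess, compare them against the set of parameters the method actually understands, and report the difference at info level instead of failing. A minimal, self-contained sketch of that pattern follows; the helper name, logger setup, and key set are illustrative stand-ins, not code from this patch series.

import logging
from typing import Any, Dict, Set

logger = logging.getLogger(__name__)

# Illustrative key set: what a hypothetical preprocess() knows how to handle.
VALID_PREPROCESS_KEYS: Set[str] = {
    "do_resize", "size", "resample", "do_rescale", "rescale_factor",
    "do_normalize", "image_mean", "image_std", "do_pad", "do_center_crop",
    "return_tensors", "data_format", "input_data_format",
}

def report_unused_kwargs(kwargs: Dict[str, Any], valid_keys: Set[str]) -> None:
    # Log, rather than raise on, keyword arguments the processor will ignore.
    unused_keys = set(kwargs) - valid_keys
    if unused_keys:
        logger.info(
            "Unused or unrecognized configuration parameters: %s.",
            ", ".join(sorted(unused_keys)),
        )

# A tokenizer-only argument passed to an image processor is reported, not rejected.
report_unused_kwargs({"padding": "max_length", "do_resize": True}, VALID_PREPROCESS_KEYS)

Keeping the check at info level rather than raising preserves backward compatibility for callers that still forward tokenizer kwargs to the image processor, in line with the "keep backwards compat" and "log kwargs mismatch to info level" commits above.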
From 270bb9e2f2935af8d05909a5fb801a55a2ec0053 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Fri, 16 Feb 2024 14:04:13 +0000 Subject: [PATCH 23/29] fix expected tests for bridgetower --- src/transformers/models/blip_2/processing_blip_2.py | 1 - .../models/bridgetower/processing_bridgetower.py | 1 - .../models/bridgetower/test_image_processing_bridgetower.py | 5 +++-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/blip_2/processing_blip_2.py b/src/transformers/models/blip_2/processing_blip_2.py index 0b11db42e53095..f160e3431d3e0e 100644 --- a/src/transformers/models/blip_2/processing_blip_2.py +++ b/src/transformers/models/blip_2/processing_blip_2.py @@ -81,7 +81,6 @@ def __call__( text_encoding = None if text is not None: - self.current_processor = self.tokenizer text_encoding = self.tokenizer( text=text, add_special_tokens=add_special_tokens, diff --git a/src/transformers/models/bridgetower/processing_bridgetower.py b/src/transformers/models/bridgetower/processing_bridgetower.py index 681af3f5cb4db0..56d5a756ca97ee 100644 --- a/src/transformers/models/bridgetower/processing_bridgetower.py +++ b/src/transformers/models/bridgetower/processing_bridgetower.py @@ -106,7 +106,6 @@ def __call__( verbose=verbose, ) # add pixel_values + pixel_mask - print(size) encoding_image_processor = self.image_processor( images, do_resize=do_resize, diff --git a/tests/models/bridgetower/test_image_processing_bridgetower.py b/tests/models/bridgetower/test_image_processing_bridgetower.py index 78b96ff3f75753..84350d34575546 100644 --- a/tests/models/bridgetower/test_image_processing_bridgetower.py +++ b/tests/models/bridgetower/test_image_processing_bridgetower.py @@ -39,7 +39,7 @@ def __init__( do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, - do_center_crop: bool = False, + do_center_crop: bool = False, # Current expected shape are not center-cropped. image_mean: Optional[Union[float, List[float]]] = [0.48145466, 0.4578275, 0.40821073], image_std: Optional[Union[float, List[float]]] = [0.26862954, 0.26130258, 0.27577711], do_pad: bool = True, @@ -66,9 +66,10 @@ def __init__( def prepare_image_processor_dict(self): return { + "do_normalize": self.do_normalize, "image_mean": self.image_mean, "image_std": self.image_std, - "do_normalize": self.do_normalize, + "do_pad": self.do_pad, "do_resize": self.do_resize, "do_center_crop": self.do_center_crop, "size": self.size, From 004c961d6b167431d17e23140281c681143ea216 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 26 Feb 2024 09:57:03 +0000 Subject: [PATCH 24/29] fix valid processor keys --- .../image_processing_bridgetower.py | 25 +------------------ 1 file changed, 1 insertion(+), 24 deletions(-) diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index 2a80978eebe02a..ef62949745fef4 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -246,6 +246,7 @@ def __init__( "return_tensors", "data_format", "input_data_format", + "pad_and_return_pixel_mask" ] # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize @@ -493,30 +494,6 @@ def preprocess( Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also created and returned. Deprecated version of do_pad. 
""" - valid_processor_keys = { - "images", - "do_resize", - "size", - "size_divisor", - "resample", - "do_rescale", - "rescale_factor", - "do_normalize", - "image_mean", - "image_std", - "do_pad", - "do_center_crop", - "return_tensors", - "data_format", - "input_data_format", - "pad_and_return_pixel_mask", - } - - unused_keys = set(kwargs.keys()) - valid_processor_keys - if unused_keys: - unused_key_str = ", ".join(unused_keys) - logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") - do_resize = do_resize if do_resize is not None else self.do_resize size_divisor = size_divisor if size_divisor is not None else self.size_divisor resample = resample if resample is not None else self.resample From c2e49f542ca5000c4506d2bfb5ff4c29ddd59afd Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 26 Feb 2024 10:00:18 +0000 Subject: [PATCH 25/29] remove unused arg list --- .../image_processing_chinese_clip.py | 22 ------------------- tests/test_image_processing_common.py | 1 + 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py index 98495e04c6558e..d92caa79923e92 100644 --- a/src/transformers/models/chinese_clip/image_processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/image_processing_chinese_clip.py @@ -249,28 +249,6 @@ def preprocess( - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. """ - valid_processor_keys = { - "images", - "do_resize", - "size", - "resample", - "do_center_crop", - "crop_size", - "do_rescale", - "rescale_factor", - "do_normalize", - "image_mean", - "image_std", - "do_convert_rgb", - "return_tensors", - "data_format", - "input_data_format", - } - - unused_keys = set(kwargs.keys()) - valid_processor_keys - if unused_keys: - unused_key_str = ", ".join(unused_keys) - logger.info(f"Unused or unrecognized configuration parameters: {unused_key_str}.") do_resize = do_resize if do_resize is not None else self.do_resize size = size if size is not None else self.size diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py index 90c1a4e7e12708..074da8a2bbab38 100644 --- a/tests/test_image_processing_common.py +++ b/tests/test_image_processing_common.py @@ -291,6 +291,7 @@ def test_call_numpy_4_channels(self): ) def test_image_processor_preprocess_arguments(self): + # Test that an instantiated image processor is called with the correct arg spec image_processor = self.image_processing_class(**self.image_processor_dict) if hasattr(image_processor, "_valid_processor_keys") and hasattr(image_processor, "preprocess"): preprocess_parameter_names = inspect.getfullargspec(image_processor.preprocess).args From 79958b5802b98caf6c93561c428d479eac1b4cf2 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Mon, 26 Feb 2024 10:25:44 +0000 Subject: [PATCH 26/29] quality --- src/transformers/models/bit/image_processing_bit.py | 1 + .../models/bridgetower/image_processing_bridgetower.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py index d91c050662ba0c..c9d5c7a7594a49 100644 --- a/src/transformers/models/bit/image_processing_bit.py +++ b/src/transformers/models/bit/image_processing_bit.py @@ -207,6 +207,7 @@ def preprocess( 
return_tensors: Optional[Union[str, TensorType]] = None, data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, ) -> PIL.Image.Image: """ Preprocess an image or batch of images. diff --git a/src/transformers/models/bridgetower/image_processing_bridgetower.py b/src/transformers/models/bridgetower/image_processing_bridgetower.py index ef62949745fef4..bf3c37b9b7cd2c 100644 --- a/src/transformers/models/bridgetower/image_processing_bridgetower.py +++ b/src/transformers/models/bridgetower/image_processing_bridgetower.py @@ -246,7 +246,7 @@ def __init__( "return_tensors", "data_format", "input_data_format", - "pad_and_return_pixel_mask" + "pad_and_return_pixel_mask", ] # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize From 3238dd35cd82d686a6eeffa25d025f4de62ad7e2 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 18 Apr 2024 16:35:03 +0000 Subject: [PATCH 27/29] skeleton draft - uniform processor call --- .../models/donut/processing_donut.py | 139 +++++++----------- 1 file changed, 57 insertions(+), 82 deletions(-) diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index ae2588b85e2040..38e9b9426f8fd7 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -64,46 +64,61 @@ def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None) super().__init__(image_processor, tokenizer) self.current_processor = self.image_processor + + self.processing_kwargs = { + 'common_kwargs':{ + 'return_tensors': 'pt' + }, + 'text_kwargs': { + 'text_pair': None, + 'text_target': None, + 'text_pair_target': None, + 'add_special_tokens': True, + 'padding': 'max_length', + 'truncation': True, + 'max_length': 512, + 'stride': 0, + 'is_split_into_words': False, + 'pad_to_multiple_of': None, + 'return_token_type_ids': True, + 'return_attention_mask': True, + 'return_overflowing_tokens': False, + 'return_special_tokens_mask': False, + 'return_offsets_mapping': False, + 'return_length': False, + 'verbose': True + }, + 'images_kwargs': { + 'do_crop_margin': False, + 'do_resize': True, + 'size': {'height': 256, 'width': 256}, + 'resample': 'bilinear', + 'do_thumbnail': False, + 'do_align_long_axis': False, + 'do_pad': False, + 'do_rescale': False, + 'rescale_factor': 1.0, + 'do_normalize': True, + 'image_mean': [0.485, 0.456, 0.406], + 'image_std': [0.229, 0.224, 0.225], + 'data_format': 'channels_first', + 'input_data_format': None + }, + 'audio_kwargs': { + }, + 'videos_kwargs': { + }, + } + self._in_target_context_manager = False def __call__( self, text=None, images=None, - do_crop_margin: bool = None, - do_resize: bool = None, - size: Dict[str, int] = None, - resample: "PILImageResampling" = None, # noqa: F821 - do_thumbnail: bool = None, - do_align_long_axis: bool = None, - do_pad: bool = None, - do_rescale: bool = None, - rescale_factor: Union[int, float] = None, - do_normalize: bool = None, - image_mean: Optional[Union[float, List[float]]] = None, - image_std: Optional[Union[float, List[float]]] = None, - data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821 - input_data_format: Optional[Union[str, "ChannelDimension"]] = None, # noqa: F821 - text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, - text_target: Union[TextInput, PreTokenizedInput, List[TextInput], 
List[PreTokenizedInput]] = None, - text_pair_target: Optional[ - Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] - ] = None, - add_special_tokens: bool = True, - padding: Union[bool, str, PaddingStrategy] = False, - truncation: Union[bool, str, TruncationStrategy] = None, - max_length: Optional[int] = None, - stride: int = 0, - is_split_into_words: bool = False, - pad_to_multiple_of: Optional[int] = None, - return_tensors: Optional[Union[str, TensorType]] = None, - return_token_type_ids: Optional[bool] = None, - return_attention_mask: Optional[bool] = None, - return_overflowing_tokens: bool = False, - return_special_tokens_mask: bool = False, - return_offsets_mapping: bool = False, - return_length: bool = False, - verbose: bool = True, + audio=None, + videos=None, # end of supported modalities in call + **kwargs ): """ When used in normal mode, this method forwards all its arguments to AutoImageProcessor's @@ -113,68 +128,28 @@ def __call__( """ # For backward compatibility if self._in_target_context_manager: + image_kwargs = {**self.processing_kwargs.get('images_kwargs', {}), **self.processing_kwargs.get('common_kwargs'), **kwargs} return self.current_processor( images, - do_crop_margin=do_crop_margin, - do_resize=do_resize, - size=size, - resample=resample, - do_thumbnail=do_thumbnail, - do_align_long_axis=do_align_long_axis, - do_pad=do_pad, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - return_tensors=return_tensors, - data_format=data_format, - input_data_format=input_data_format, + **image_kwargs, ) if images is None and text is None: raise ValueError("You need to specify either an `images` or `text` input to process.") if images is not None: + image_kwargs = {**self.processing_kwargs.get('images_kwargs', {}), **self.processing_kwargs.get('common_kwargs'), **kwargs} inputs = self.image_processor( images, - do_crop_margin=do_crop_margin, - do_resize=do_resize, - size=size, - resample=resample, - do_thumbnail=do_thumbnail, - do_align_long_axis=do_align_long_axis, - do_pad=do_pad, - do_rescale=do_rescale, - rescale_factor=rescale_factor, - do_normalize=do_normalize, - image_mean=image_mean, - image_std=image_std, - return_tensors=return_tensors, - data_format=data_format, - input_data_format=input_data_format, + **image_kwargs, + ) if text is not None: + text_kwargs = {**self.processing_kwargs.get('text_kwargs', {}), **self.processing_kwargs.get('common_kwargs'), **kwargs} + encodings = self.tokenizer( text, - text_pair=text_pair, - text_target=text_target, - text_pair_target=text_pair_target, - add_special_tokens=add_special_tokens, - padding=padding, - truncation=truncation, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - pad_to_multiple_of=pad_to_multiple_of, - return_tensors=return_tensors, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - return_offsets_mapping=return_offsets_mapping, - return_length=return_length, - verbose=verbose, + **text_kwargs, ) if text is None: From 3afde2299dde1c47acc7195780720fdaf9eb9bae Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 18 Apr 2024 16:48:01 +0000 Subject: [PATCH 28/29] fix quality --- .../models/donut/processing_donut.py | 103 +++++++++--------- 1 file changed, 53 insertions(+), 50 deletions(-) diff --git 
a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py index 38e9b9426f8fd7..809e3ddf57e68e 100644 --- a/src/transformers/models/donut/processing_donut.py +++ b/src/transformers/models/donut/processing_donut.py @@ -18,12 +18,8 @@ import re import warnings from contextlib import contextmanager -from typing import Dict, List, Optional, Union - -from transformers.tokenization_utils_base import PreTokenizedInput, TextInput, TruncationStrategy from ...processing_utils import ProcessorMixin -from ...utils import PaddingStrategy, TensorType class DonutProcessor(ProcessorMixin): @@ -66,48 +62,44 @@ def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None) self.current_processor = self.image_processor self.processing_kwargs = { - 'common_kwargs':{ - 'return_tensors': 'pt' - }, - 'text_kwargs': { - 'text_pair': None, - 'text_target': None, - 'text_pair_target': None, - 'add_special_tokens': True, - 'padding': 'max_length', - 'truncation': True, - 'max_length': 512, - 'stride': 0, - 'is_split_into_words': False, - 'pad_to_multiple_of': None, - 'return_token_type_ids': True, - 'return_attention_mask': True, - 'return_overflowing_tokens': False, - 'return_special_tokens_mask': False, - 'return_offsets_mapping': False, - 'return_length': False, - 'verbose': True - }, - 'images_kwargs': { - 'do_crop_margin': False, - 'do_resize': True, - 'size': {'height': 256, 'width': 256}, - 'resample': 'bilinear', - 'do_thumbnail': False, - 'do_align_long_axis': False, - 'do_pad': False, - 'do_rescale': False, - 'rescale_factor': 1.0, - 'do_normalize': True, - 'image_mean': [0.485, 0.456, 0.406], - 'image_std': [0.229, 0.224, 0.225], - 'data_format': 'channels_first', - 'input_data_format': None + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": { + "text_pair": None, + "text_target": None, + "text_pair_target": None, + "add_special_tokens": True, + "padding": "max_length", + "truncation": True, + "max_length": 512, + "stride": 0, + "is_split_into_words": False, + "pad_to_multiple_of": None, + "return_token_type_ids": True, + "return_attention_mask": True, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, }, - 'audio_kwargs': { - }, - 'videos_kwargs': { + "images_kwargs": { + "do_crop_margin": False, + "do_resize": True, + "size": {"height": 256, "width": 256}, + "resample": "bilinear", + "do_thumbnail": False, + "do_align_long_axis": False, + "do_pad": False, + "do_rescale": False, + "rescale_factor": 1.0, + "do_normalize": True, + "image_mean": [0.485, 0.456, 0.406], + "image_std": [0.229, 0.224, 0.225], + "data_format": "channels_first", + "input_data_format": None, }, + "audio_kwargs": {}, + "videos_kwargs": {}, } self._in_target_context_manager = False @@ -117,8 +109,8 @@ def __call__( text=None, images=None, audio=None, - videos=None, # end of supported modalities in call - **kwargs + videos=None, # end of supported modalities in call + **kwargs, ): """ When used in normal mode, this method forwards all its arguments to AutoImageProcessor's @@ -128,7 +120,11 @@ def __call__( """ # For backward compatibility if self._in_target_context_manager: - image_kwargs = {**self.processing_kwargs.get('images_kwargs', {}), **self.processing_kwargs.get('common_kwargs'), **kwargs} + image_kwargs = { + **self.processing_kwargs.get("images_kwargs", {}), + **self.processing_kwargs.get("common_kwargs"), + **kwargs, + } return 
self.current_processor( images, **image_kwargs, @@ -138,14 +134,21 @@ def __call__( raise ValueError("You need to specify either an `images` or `text` input to process.") if images is not None: - image_kwargs = {**self.processing_kwargs.get('images_kwargs', {}), **self.processing_kwargs.get('common_kwargs'), **kwargs} + image_kwargs = { + **self.processing_kwargs.get("images_kwargs", {}), + **self.processing_kwargs.get("common_kwargs"), + **kwargs, + } inputs = self.image_processor( images, **image_kwargs, - ) if text is not None: - text_kwargs = {**self.processing_kwargs.get('text_kwargs', {}), **self.processing_kwargs.get('common_kwargs'), **kwargs} + text_kwargs = { + **self.processing_kwargs.get("text_kwargs", {}), + **self.processing_kwargs.get("common_kwargs"), + **kwargs, + } encodings = self.tokenizer( text, From eb99e290081f43001b2fe8746e8c4cf28976a206 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo Date: Thu, 25 Apr 2024 12:30:25 +0000 Subject: [PATCH 29/29] add broken wav2vec audio processing --- .../models/wav2vec2/processing_wav2vec2.py | 62 +++++++++++++++---- 1 file changed, 49 insertions(+), 13 deletions(-) diff --git a/src/transformers/models/wav2vec2/processing_wav2vec2.py b/src/transformers/models/wav2vec2/processing_wav2vec2.py index dc6e9d14ee66cd..47d020343fbd4d 100644 --- a/src/transformers/models/wav2vec2/processing_wav2vec2.py +++ b/src/transformers/models/wav2vec2/processing_wav2vec2.py @@ -45,6 +45,37 @@ def __init__(self, feature_extractor, tokenizer): super().__init__(feature_extractor, tokenizer) self.current_processor = self.feature_extractor self._in_target_context_manager = False + self.processing_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "text_kwargs": { + "text_pair": None, + "text_target": None, + "text_pair_target": None, + "add_special_tokens": True, + "padding": "max_length", + "truncation": True, + "max_length": 512, + "stride": 0, + "is_split_into_words": False, + "pad_to_multiple_of": None, + "return_token_type_ids": True, + "return_attention_mask": True, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": { + }, + "audio_kwargs": { + "sampling_rate": None, + "raw_speech": None, + }, + "videos_kwargs": { + }, + } + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): @@ -65,7 +96,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): return cls(feature_extractor=feature_extractor, tokenizer=tokenizer) - def __call__(self, *args, **kwargs): + def __call__( + self, + audio=None, + text=None, + images=None, + videos=None, # end of supported modalities in call + *deprecated_args, + **deprecated_kwargs, + ): """ When used in normal mode, this method forwards all its arguments to Wav2Vec2FeatureExtractor's [`~Wav2Vec2FeatureExtractor.__call__`] and returns its output. If used in the context @@ -74,26 +113,23 @@ def __call__(self, *args, **kwargs): """ # For backward compatibility if self._in_target_context_manager: - return self.current_processor(*args, **kwargs) - - if "raw_speech" in kwargs: + return self.current_processor(audio, *deprecated_args, **self.processing_kwargs['audio_kwargs'], **deprecated_kwargs) + print(deprecated_args, deprecated_kwargs) + if "raw_speech" in deprecated_kwargs: + breakpoint() warnings.warn("Using `raw_speech` as a keyword argument is deprecated. 
Use `audio` instead.") - audio = kwargs.pop("raw_speech") + audio = deprecated_kwargs.pop("raw_speech") else: - audio = kwargs.pop("audio", None) - sampling_rate = kwargs.pop("sampling_rate", None) - text = kwargs.pop("text", None) - if len(args) > 0: - audio = args[0] - args = args[1:] + audio = deprecated_kwargs.pop("audio", None) + sampling_rate = deprecated_kwargs.pop("sampling_rate", None) if audio is None and text is None: raise ValueError("You need to specify either an `audio` or `text` input to process.") if audio is not None: - inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs) + inputs = self.feature_extractor(audio, *deprecated_args, sampling_rate=sampling_rate, **deprecated_kwargs) if text is not None: - encodings = self.tokenizer(text, **kwargs) + encodings = self.tokenizer(text, *deprecated_args, **deprecated_kwargs) if text is None: return inputs
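The Donut and Wav2Vec2 skeletons above share one idea: each processor carries per-modality default kwargs, and a call merges the modality defaults, then the common defaults, then whatever the caller passed, so that later entries win. A standalone sketch of that precedence follows, with hypothetical defaults rather than the real Donut or Wav2Vec2 values; it illustrates the merge order only and is not the final API.

from typing import Any, Dict

# Hypothetical defaults, in the spirit of the processing_kwargs dicts above.
PROCESSING_KWARGS: Dict[str, Dict[str, Any]] = {
    "common_kwargs": {"return_tensors": "pt"},
    "text_kwargs": {"padding": "max_length", "truncation": True, "max_length": 512},
    "images_kwargs": {"do_resize": True, "size": {"height": 256, "width": 256}},
}

def merge_kwargs(modality: str, **caller_kwargs: Any) -> Dict[str, Any]:
    # Later dicts win: modality defaults < common defaults < caller overrides.
    return {
        **PROCESSING_KWARGS.get(modality, {}),
        **PROCESSING_KWARGS["common_kwargs"],
        **caller_kwargs,
    }

# Caller overrides take precedence over both layers of defaults.
text_kwargs = merge_kwargs("text_kwargs", max_length=128, return_tensors="np")
assert text_kwargs == {
    "padding": "max_length",
    "truncation": True,
    "max_length": 128,
    "return_tensors": "np",
}

One consequence of this order, visible in the Donut __call__ above, is that a key present in common_kwargs (such as return_tensors) overrides the same key in a modality dict unless the caller passes it explicitly.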