diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index db9d1276832b39..38173cbd861fc1 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -165,7 +165,8 @@ def __call__( image_size = next(image_sizes) if not isinstance(image_size, (list, tuple)): # cast to list to avoid numerical precision errors when calculating unpadding - orig_height, orig_width = image_size.tolist() + image_size = image_size.tolist() + orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= self.num_additional_image_tokens diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 5805782f779e5f..65195b77240721 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -192,7 +192,8 @@ def __call__( image_size = next(image_sizes) if not isinstance(image_size, (list, tuple)): # cast to list to avoid numerical precision errors when calculating unpadding - orig_height, orig_width = image_size.tolist() + image_size = image_size.tolist() + orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= self.num_additional_image_tokens diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index 4f67f9e4c030e8..ff808802bcaa63 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -191,7 +191,8 @@ def _expand_image_tokens( original_size = image_size_list[0] if num_frames != 1 else image_size_list if not isinstance(original_size, (list, tuple)): # cast to list to avoid numerical precision errors when calculating unpadding - orig_height, orig_width = original_size.tolist() + original_size = original_size.tolist() + orig_height, orig_width = original_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1