Skip to content

Commit

Permalink
It's Friday night, let's cross our fingers
Browse files Browse the repository at this point in the history
  • Loading branch information
ydshieh committed Dec 13, 2024
1 parent b1db4f2 commit f8c98d6
Showing 1 changed file with 50 additions and 14 deletions.
64 changes: 50 additions & 14 deletions src/transformers/models/kosmos2_5/processing_kosmos2_5.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,41 @@
from typing import List, Optional, Union

from ...image_processing_utils import BatchFeature
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
from ...utils import TensorType, is_torch_available
from ...image_utils import ImageInput
from ...processing_utils import ImagesKwargs, ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
from ...tokenization_utils_base import TextInput
from ...utils import is_torch_available


if is_torch_available():
import torch


class Kosmos2_5ImagesKwargs(ImagesKwargs, total=False):
    """
    Image-side keyword arguments for Kosmos2_5: everything inherited from `ImagesKwargs` plus two extra keys.

    The class is declared with `total=False`, so the keys declared here may be omitted by callers.
    """

    # NOTE(review): presumably caps the number of image patches produced by the image processor -- confirm there.
    max_patches: Optional[int]
    # NOTE(review): presumably the number of image placeholder tokens inserted for each image -- confirm in `__call__`.
    num_image_tokens: Optional[int]


class Kosmos2_5ProcessorKwargs(ProcessingKwargs, total=False):
    """
    Keyword arguments accepted by `Kosmos2_5Processor.__call__` (used there as `Unpack[Kosmos2_5ProcessorKwargs]`),
    grouped into a text group and an images group.

    The class itself is handed to `self._merge_kwargs(...)` inside `__call__`. `_defaults` presumably supplies the
    value used for each option the caller does not override -- confirm against `ProcessorMixin._merge_kwargs`.
    """

    # Options in the text group use the shared `TextKwargs` schema.
    text_kwargs: TextKwargs
    # Options in the images group use the Kosmos2_5-specific schema, which adds `max_patches` and `num_image_tokens`.
    images_kwargs: Kosmos2_5ImagesKwargs
    # Per-group default values, keyed by the group names declared above.
    _defaults = {
        "text_kwargs": {
            "padding": True,
            "truncation": True,
            "max_length": None,
            "stride": 0,
            "pad_to_multiple_of": None,
            "return_attention_mask": None,
            # NOTE(review): "pt" presumably selects PyTorch tensors as the output type -- confirm.
            "return_tensors": "pt",
        },
        "images_kwargs": {
            "max_patches": 4096,
            "num_image_tokens": 2048,
        },
    }


class Kosmos2_5Processor(ProcessorMixin):
r"""
Constructs a Kosmos2_5 processor which wraps a PreTrainedTokenizerFast and Kosmos2_5 image processor into a single
Expand Down Expand Up @@ -58,18 +84,11 @@ def __init__(self, image_processor, tokenizer):

def __call__(
self,
images=None,
images: ImageInput = None,
text: Union[TextInput, List[TextInput]] = None,
padding: Union[bool, str, PaddingStrategy] = True,
truncation: Union[bool, str, TruncationStrategy] = True,
max_length: Optional[int] = None,
max_patches: Optional[int] = 4096,
num_image_tokens: Optional[int] = 2048,
stride: int = 0,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
return_tensors: Optional[Union[str, TensorType]] = "pt",
**kwargs,
audio=None,
videos=None,
**kwargs: Unpack[Kosmos2_5ProcessorKwargs],
) -> BatchFeature:
"""
This method uses [`Kosmos2_5ImageProcessor.preprocess`] method to prepare image(s) for the model, and
Expand All @@ -85,6 +104,23 @@ def __call__(
if images is None:
raise ValueError("Kosmos2_5Processor requires images to be passed.")

output_kwargs = self._merge_kwargs(
Kosmos2_5ProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)

max_patches = output_kwargs["images_kwargs"].setdefault("max_patches", None)
num_image_tokens = output_kwargs["images_kwargs"].setdefault("num_image_tokens", None)

padding = output_kwargs["text_kwargs"].setdefault("padding", None)
truncation = output_kwargs["text_kwargs"].setdefault("truncation", None)
max_length = output_kwargs["text_kwargs"].setdefault("max_length", None)
stride = output_kwargs["text_kwargs"].setdefault("stride", None)
pad_to_multiple_of = output_kwargs["text_kwargs"].setdefault("pad_to_multiple_of", None)
return_attention_mask = output_kwargs["text_kwargs"].setdefault("return_attention_mask", None)
return_tensors = output_kwargs["text_kwargs"].setdefault("return_tensors", None)

encoding = BatchFeature()

if images is not None:
Expand Down

0 comments on commit f8c98d6

Please sign in to comment.