diff --git a/src/transformers/models/got_ocr2/modular_got_ocr2.py b/src/transformers/models/got_ocr2/modular_got_ocr2.py
index e8ad27f8d776c6..5b699e55d11a0d 100644
--- a/src/transformers/models/got_ocr2/modular_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/modular_got_ocr2.py
@@ -46,8 +46,10 @@
 from ...modeling_outputs import CausalLMOutputWithPast
 from ...utils import (
     ModelOutput,
+    add_start_docstrings_to_model_forward,
     is_vision_available,
     logging,
+    replace_return_docstrings,
 )
 
 
@@ -58,6 +60,8 @@
 logger = logging.get_logger(__name__)
 
 
+_CONFIG_FOR_DOC = "GotOcr2Config"
+
 
 class GotOcr2VisionConfig(PretrainedConfig):
     r"""
@@ -167,6 +171,8 @@ class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
     num_image_tokens: Optional[int]
     multi_page: Optional[bool]
     crop_to_patches: Optional[bool]
+    min_patches: Optional[int]
+    max_patches: Optional[int]
 
 
 class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
@@ -179,6 +185,8 @@ class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
         },
         "images_kwargs": {
             "num_image_tokens": 256,
+            "min_patches": 1,
+            "max_patches": 6,
         },
     }
 
@@ -359,6 +367,12 @@ def __call__(
                 If set, will enable multi-page inference. The model will return the OCR result across multiple pages.
             crop_to_patches (`bool`, *optional*):
                 If set, will crop the image to patches. The model will return the OCR result upon the patch reference.
+            min_patches (`int`, *optional*):
+                The minimum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to
+                `True`.
+            max_patches (`int`, *optional*):
+                The maximum number of patches to be cropped from the image. Only used when `crop_to_patches` is set to
+                `True`.
             return_tensors (`str` or [`~utils.TensorType`], *optional*):
                 If set, will return tensors of a particular framework. Acceptable values are:
 
@@ -429,7 +443,10 @@ def __call__(
             for index, (image_group, box_single, color_single) in enumerate(zip(images, box, color)):
                 if crop_to_patches:
                     image_group = self.image_processor.crop_image_to_patches(
-                        image_group, size=output_kwargs["images_kwargs"].get("size", None)
+                        image_group,
+                        size=output_kwargs["images_kwargs"].get("size"),
+                        min_num=output_kwargs["images_kwargs"].get("min_patches"),
+                        max_num=output_kwargs["images_kwargs"].get("max_patches"),
                     )
                     images[index] = image_group
                 num_images = len(image_group) if (multi_page or crop_to_patches) else 1
@@ -468,14 +485,14 @@ def __call__(
 
     def batch_decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
         """
         return self.tokenizer.batch_decode(*args, **kwargs)
 
     def decode(self, *args, **kwargs):
         """
-        This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
         """
         return self.tokenizer.decode(*args, **kwargs)
@@ -572,6 +589,76 @@ class GotOcr2Model(Qwen2Model):
     pass
 
 
+GOT_OCR2_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        pixel_values (`torch.FloatTensor` of shape `(seq_length, num_channels * image_size * image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`GotOcr2Processor`] uses
+            [`GotOcr2ImageProcessor`] for processing images.
+"""
+
+
 class GotOcr2ForConditionalGeneration(GotOcr2PreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
 
@@ -620,6 +707,8 @@ def _update_model_kwargs_for_generation(
 
         return model_kwargs
 
+    @add_start_docstrings_to_model_forward(GOT_OCR2_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
         self,
         input_ids: torch.LongTensor = None,
diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py
index ffc0fb1742c32a..1718b651a91db7 100644
--- a/src/transformers/models/got_ocr2/processing_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py
@@ -50,6 +50,8 @@ class GotOcr2ImagesKwargs(ImagesKwargs, total=False):
     num_image_tokens: Optional[int]
     multi_page: Optional[bool]
     crop_to_patches: Optional[bool]
+    min_patches: Optional[int]
+    max_patches: Optional[int]
 
 
 class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
@@ -62,6 +64,8 @@ class GotOcr2ProcessorKwargs(ProcessingKwargs, total=False):
         },
         "images_kwargs": {
             "num_image_tokens": 256,
+            "min_patches": 1,
+            "max_patches": 6,
         },
     }
 
@@ -213,7 +217,10 @@ def __call__(
             for index, (image_group, box_single, color_single) in enumerate(zip(images, box, color)):
                 if crop_to_patches:
                     image_group = self.image_processor.crop_image_to_patches(
-                        image_group, size=output_kwargs["images_kwargs"].get("size", None)
+                        image_group,
+                        size=output_kwargs["images_kwargs"].get("size"),
+                        min_num=output_kwargs["images_kwargs"].get("min_patches"),
+                        max_num=output_kwargs["images_kwargs"].get("max_patches"),
                     )
                     images[index] = image_group
                 num_images = len(image_group) if (multi_page or crop_to_patches) else 1
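
For reference, here is a minimal usage sketch exercising the new `min_patches` / `max_patches` processor kwargs added above. Only `crop_to_patches`, `min_patches`, `max_patches`, and their defaults (1 and 6) come from this diff; the checkpoint name and the `AutoModelForImageTextToText` entry point are illustrative assumptions, not part of the change.

```python
# Hedged sketch: assumes the "stepfun-ai/GOT-OCR-2.0-hf" checkpoint and the
# AutoModelForImageTextToText auto class; only the min_patches/max_patches
# kwargs (defaults 1 and 6) are introduced by this PR.
from transformers import AutoModelForImageTextToText, AutoProcessor

model_id = "stepfun-ai/GOT-OCR-2.0-hf"  # illustrative checkpoint name
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForImageTextToText.from_pretrained(model_id)

# crop_to_patches tiles the page before OCR; min_patches/max_patches bound how
# many tiles crop_image_to_patches may produce for each input image.
inputs = processor(
    "document_page.png",  # path or URL to the page image
    return_tensors="pt",
    crop_to_patches=True,
    min_patches=1,
    max_patches=6,
)

generated = model.generate(**inputs, do_sample=False, max_new_tokens=128)
# Strip the prompt tokens before decoding the OCR output.
text = processor.decode(
    generated[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True
)
print(text)
```

Given the prompt construction in `__call__` (`num_images = len(image_group)` when `crop_to_patches` is set), raising `max_patches` appears to add `num_image_tokens` (256 by default) image tokens per extra patch, so it trades prompt length and latency for better coverage of large or dense pages.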