diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py
index 5d04d7b6134455..3e1884271efe2b 100644
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -40,9 +40,9 @@ class VideoLlavaProcessor(ProcessorMixin):
             The image processor is a required input.
         tokenizer ([`LlamaTokenizerFast`], *optional*):
             The tokenizer is a required input.
-        patch_size (`int`, *optional*):
+        patch_size (`int`, *optional*, defaults to 14):
             Patch size from the vision tower.
-        vision_feature_select_strategy (`str`, *optional*):
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
             The feature selection strategy used to select the vision feature from the vision backbone.
             Should be the same as in the model's config.
         image_token (`str`, *optional*, defaults to `"<image>"`):
@@ -51,7 +51,7 @@ class VideoLlavaProcessor(ProcessorMixin):
             Special token used to denote video location.
         chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
             in a chat into a tokenizable string.
-        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+        num_additional_image_tokens (`int`, *optional*, defaults to 1):
             Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
             extra tokens appended, no need to set this arg.
     """
@@ -72,12 +72,12 @@ def __init__(
         self,
         image_processor=None,
         tokenizer=None,
-        patch_size=None,
-        vision_feature_select_strategy=None,
+        patch_size=14,
+        vision_feature_select_strategy="default",
         image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
         video_token="<video>",
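
For context (this is not part of the patch): a minimal sketch of how the new defaults would feed into the processor's placeholder-token expansion, assuming the per-image token count is derived as `(height / patch_size) * (width / patch_size)` plus the extra backbone tokens, with one token dropped again under the `"default"` feature-selection strategy. The function name and the 224x224 resolution below are illustrative assumptions, not taken from the diff.

```python
# Illustrative sketch (not the library's exact code): how patch_size=14,
# num_additional_image_tokens=1 and vision_feature_select_strategy="default"
# would determine the number of <image> placeholder tokens per image.

def estimated_image_placeholder_tokens(
    height: int = 224,                                # assumed vision-tower input resolution
    width: int = 224,
    patch_size: int = 14,                             # new default from this diff
    num_additional_image_tokens: int = 1,             # new default: accounts for the CLS token
    vision_feature_select_strategy: str = "default",  # new default
) -> int:
    # one token per patch, plus any extra tokens the backbone appends (e.g. CLS)
    tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
    if vision_feature_select_strategy == "default":
        # the "default" strategy drops the CLS feature, so one token is removed again
        tokens -= 1
    return tokens


print(estimated_image_placeholder_tokens())  # 16 * 16 + 1 - 1 = 256
```

Presumably the point of turning these arguments into concrete defaults is that the processor can perform this kind of expansion out of the box, rather than requiring users to pass `patch_size`, `vision_feature_select_strategy`, and `num_additional_image_tokens` explicitly.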