diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index d82c2c017fadaa..177bb2746391e0 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1160,6 +1160,7 @@ _import_structure["models.swin2sr"].append("Swin2SRImageProcessor") _import_structure["models.tvlt"].append("TvltImageProcessor") _import_structure["models.tvp"].append("TvpImageProcessor") + _import_structure["models.video_llava"].append("VideoLlavaImageProcessor") _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"]) _import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"]) _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"]) @@ -3243,7 +3244,6 @@ _import_structure["models.video_llava"].extend( [ "VideoLlavaForConditionalGeneration", - "VideoLlavaImageProcessor", "VideoLlavaPreTrainedModel", "VideoLlavaProcessor", ] @@ -5759,6 +5759,7 @@ from .models.swin2sr import Swin2SRImageProcessor from .models.tvlt import TvltImageProcessor from .models.tvp import TvpImageProcessor + from .models.video_llava import VideoLlavaImageProcessor from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor from .models.vit import ViTFeatureExtractor, ViTImageProcessor @@ -7437,7 +7438,6 @@ ) from .models.video_llava import ( VideoLlavaForConditionalGeneration, - VideoLlavaImageProcessor, VideoLlavaPreTrainedModel, VideoLlavaProcessor, ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index be3547dcfcccbb..681f8585566f3c 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -8417,13 +8417,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class VideoLlavaImageProcessor(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class VideoLlavaPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index 7510f91dfcd5d3..d32778d4b5f681 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -534,6 +534,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class VideoLlavaImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class VideoMAEFeatureExtractor(metaclass=DummyObject): _backends = ["vision"]