diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index ae6aa85cce7e15..7576cd8eeb46a8 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.45.0.dev0" +__version__ = "4.48.0.dev0" from typing import TYPE_CHECKING @@ -57,7 +57,8 @@ "agents": [ "Agent", "CodeAgent", - "HfEngine", + "HfApiEngine", + "ManagedAgent", "PipelineTool", "ReactAgent", "ReactCodeAgent", @@ -65,9 +66,11 @@ "Tool", "Toolbox", "ToolCollection", + "TransformersEngine", "launch_gradio_demo", "load_tool", "stream_to_gradio", + "tool", ], "audio_utils": [], "benchmark": [], @@ -112,7 +115,6 @@ "data.metrics": [], "data.processors": [], "debug_utils": [], - "deepspeed": [], "dependency_versions_check": [], "dependency_versions_table": [], "dynamic_module_utils": [], @@ -120,6 +122,7 @@ "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"], "file_utils": [], "generation": [ + "CompileConfig", "GenerationConfig", "TextIteratorStreamer", "TextStreamer", @@ -140,7 +143,9 @@ "is_tensorboard_available", "is_wandb_available", ], + "loss": [], "modelcard": ["ModelCard"], + # Losses "modeling_tf_pytorch_utils": [ "convert_tf_weight_name_to_pt_weight_name", "load_pytorch_checkpoint_in_tf2_model", @@ -165,6 +170,11 @@ "AltCLIPTextConfig", "AltCLIPVisionConfig", ], + "models.aria": [ + "AriaConfig", + "AriaProcessor", + "AriaTextConfig", + ], "models.audio_spectrogram_transformer": [ "ASTConfig", "ASTFeatureExtractor", @@ -451,6 +461,7 @@ "GitProcessor", "GitVisionConfig", ], + "models.glm": ["GlmConfig"], "models.glpn": ["GLPNConfig"], "models.gpt2": [ "GPT2Config", @@ -462,6 +473,8 @@ "models.gpt_neox_japanese": ["GPTNeoXJapaneseConfig"], "models.gpt_sw3": [], "models.gptj": ["GPTJConfig"], + "models.granite": ["GraniteConfig"], + "models.granitemoe": ["GraniteMoeConfig"], "models.grounding_dino": [ "GroundingDinoConfig", "GroundingDinoProcessor", @@ -477,6 +490,8 @@ "models.ibert": ["IBertConfig"], "models.idefics": ["IdeficsConfig"], "models.idefics2": ["Idefics2Config"], + "models.idefics3": ["Idefics3Config"], + "models.ijepa": ["IJepaConfig"], "models.imagegpt": ["ImageGPTConfig"], "models.informer": ["InformerConfig"], "models.instructblip": [ @@ -532,6 +547,7 @@ "LlavaNextVideoConfig", "LlavaNextVideoProcessor", ], + "models.llava_onevision": ["LlavaOnevisionConfig", "LlavaOnevisionProcessor"], "models.longformer": [ "LongformerConfig", "LongformerTokenizer", @@ -569,8 +585,13 @@ "MgpstrProcessor", "MgpstrTokenizer", ], + "models.mimi": ["MimiConfig"], "models.mistral": ["MistralConfig"], "models.mixtral": ["MixtralConfig"], + "models.mllama": [ + "MllamaConfig", + "MllamaProcessor", + ], "models.mluke": [], "models.mobilebert": [ "MobileBertConfig", @@ -580,6 +601,10 @@ "models.mobilenet_v2": ["MobileNetV2Config"], "models.mobilevit": ["MobileViTConfig"], "models.mobilevitv2": ["MobileViTV2Config"], + "models.moshi": [ + "MoshiConfig", + "MoshiDepthConfig", + ], "models.mpnet": [ "MPNetConfig", "MPNetTokenizer", @@ -596,12 +621,19 @@ "MusicgenMelodyDecoderConfig", ], "models.mvp": ["MvpConfig", "MvpTokenizer"], + "models.myt5": ["MyT5Tokenizer"], "models.nemotron": ["NemotronConfig"], "models.nllb": [], "models.nllb_moe": ["NllbMoeConfig"], "models.nougat": ["NougatProcessor"], "models.nystromformer": 
["NystromformerConfig"], "models.olmo": ["OlmoConfig"], + "models.olmo2": ["Olmo2Config"], + "models.olmoe": ["OlmoeConfig"], + "models.omdet_turbo": [ + "OmDetTurboConfig", + "OmDetTurboProcessor", + ], "models.oneformer": [ "OneFormerConfig", "OneFormerProcessor", @@ -638,6 +670,7 @@ "models.persimmon": ["PersimmonConfig"], "models.phi": ["PhiConfig"], "models.phi3": ["Phi3Config"], + "models.phimoe": ["PhimoeConfig"], "models.phobert": ["PhobertTokenizer"], "models.pix2struct": [ "Pix2StructConfig", @@ -645,6 +678,7 @@ "Pix2StructTextConfig", "Pix2StructVisionConfig", ], + "models.pixtral": ["PixtralProcessor", "PixtralVisionConfig"], "models.plbart": ["PLBartConfig"], "models.poolformer": ["PoolFormerConfig"], "models.pop2piano": ["Pop2PianoConfig"], @@ -664,6 +698,10 @@ "Qwen2AudioProcessor", ], "models.qwen2_moe": ["Qwen2MoeConfig"], + "models.qwen2_vl": [ + "Qwen2VLConfig", + "Qwen2VLProcessor", + ], "models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"], "models.recurrent_gemma": ["RecurrentGemmaConfig"], "models.reformer": ["ReformerConfig"], @@ -825,6 +863,7 @@ "models.xmod": ["XmodConfig"], "models.yolos": ["YolosConfig"], "models.yoso": ["YosoConfig"], + "models.zamba": ["ZambaConfig"], "models.zoedepth": ["ZoeDepthConfig"], "onnx": [], "pipelines": [ @@ -838,6 +877,7 @@ "ImageClassificationPipeline", "ImageFeatureExtractionPipeline", "ImageSegmentationPipeline", + "ImageTextToTextPipeline", "ImageToImagePipeline", "ImageToTextPipeline", "JsonPipelineDataFormat", @@ -912,7 +952,6 @@ "is_av_available", "is_bitsandbytes_available", "is_datasets_available", - "is_decord_available", "is_faiss_available", "is_flax_available", "is_keras_nlp_available", @@ -945,7 +984,9 @@ "utils.quantization_config": [ "AqlmConfig", "AwqConfig", + "BitNetConfig", "BitsAndBytesConfig", + "CompressedTensorsConfig", "EetqConfig", "FbgemmFp8Config", "GPTQConfig", @@ -1141,6 +1182,7 @@ _import_structure["image_processing_base"] = ["ImageProcessingMixin"] _import_structure["image_processing_utils"] = ["BaseImageProcessor"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] + _import_structure["models.aria"].extend(["AriaImageProcessor"]) _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"]) _import_structure["models.bit"].extend(["BitImageProcessor"]) _import_structure["models.blip"].extend(["BlipImageProcessor"]) @@ -1170,6 +1212,7 @@ _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"]) _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"]) + _import_structure["models.idefics3"].extend(["Idefics3ImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) _import_structure["models.instructblipvideo"].extend(["InstructBlipVideoImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) @@ -1177,8 +1220,12 @@ _import_structure["models.levit"].extend(["LevitFeatureExtractor", "LevitImageProcessor"]) _import_structure["models.llava_next"].append("LlavaNextImageProcessor") _import_structure["models.llava_next_video"].append("LlavaNextVideoImageProcessor") + _import_structure["models.llava_onevision"].extend( + ["LlavaOnevisionImageProcessor", "LlavaOnevisionVideoProcessor"] + ) _import_structure["models.mask2former"].append("Mask2FormerImageProcessor") 
_import_structure["models.maskformer"].extend(["MaskFormerFeatureExtractor", "MaskFormerImageProcessor"]) + _import_structure["models.mllama"].extend(["MllamaImageProcessor"]) _import_structure["models.mobilenet_v1"].extend(["MobileNetV1FeatureExtractor", "MobileNetV1ImageProcessor"]) _import_structure["models.mobilenet_v2"].extend(["MobileNetV2FeatureExtractor", "MobileNetV2ImageProcessor"]) _import_structure["models.mobilevit"].extend(["MobileViTFeatureExtractor", "MobileViTImageProcessor"]) @@ -1188,8 +1235,10 @@ _import_structure["models.owlvit"].extend(["OwlViTFeatureExtractor", "OwlViTImageProcessor"]) _import_structure["models.perceiver"].extend(["PerceiverFeatureExtractor", "PerceiverImageProcessor"]) _import_structure["models.pix2struct"].extend(["Pix2StructImageProcessor"]) + _import_structure["models.pixtral"].append("PixtralImageProcessor") _import_structure["models.poolformer"].extend(["PoolFormerFeatureExtractor", "PoolFormerImageProcessor"]) _import_structure["models.pvt"].extend(["PvtImageProcessor"]) + _import_structure["models.qwen2_vl"].extend(["Qwen2VLImageProcessor"]) _import_structure["models.rt_detr"].extend(["RTDetrImageProcessor"]) _import_structure["models.sam"].extend(["SamImageProcessor"]) _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"]) @@ -1218,6 +1267,10 @@ ] else: _import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"] + _import_structure["models.deformable_detr"].append("DeformableDetrImageProcessorFast") + _import_structure["models.detr"].append("DetrImageProcessorFast") + _import_structure["models.pixtral"].append("PixtralImageProcessorFast") + _import_structure["models.rt_detr"].append("RTDetrImageProcessorFast") _import_structure["models.vit"].append("ViTImageProcessorFast") # PyTorch-backed objects @@ -1241,6 +1294,7 @@ "HybridCache", "MambaCache", "OffloadedCache", + "OffloadedStaticCache", "QuantizedCache", "QuantizedCacheConfig", "QuantoQuantizedCache", @@ -1262,6 +1316,8 @@ _import_structure["generation"].extend( [ "AlternatingCodebooksLogitsProcessor", + "BayesianDetectorConfig", + "BayesianDetectorModel", "BeamScorer", "BeamSearchScorer", "ClassifierFreeGuidanceLogitsProcessor", @@ -1277,7 +1333,6 @@ "ExponentialDecayLengthPenalty", "ForcedBOSTokenLogitsProcessor", "ForcedEOSTokenLogitsProcessor", - "ForceTokensLogitsProcessor", "GenerationMixin", "HammingDiversityLogitsProcessor", "InfNanRemoveLogitsProcessor", @@ -1301,6 +1356,9 @@ "StopStringCriteria", "SuppressTokensAtBeginLogitsProcessor", "SuppressTokensLogitsProcessor", + "SynthIDTextWatermarkDetector", + "SynthIDTextWatermarkingConfig", + "SynthIDTextWatermarkLogitsProcessor", "TemperatureLogitsWarper", "TopKLogitsWarper", "TopPLogitsWarper", @@ -1311,6 +1369,13 @@ "WhisperTimeStampLogitsProcessor", ] ) + + # PyTorch domain libraries integration + _import_structure["integrations.executorch"] = [ + "TorchExportableModuleWithStaticCache", + "convert_and_export_with_cache", + ] + _import_structure["modeling_flash_attention_utils"] = [] _import_structure["modeling_outputs"] = [] _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS"] @@ -1340,7 +1405,6 @@ "AlignVisionModel", ] ) - _import_structure["models.altclip"].extend( [ "AltCLIPModel", @@ -1349,6 +1413,15 @@ "AltCLIPVisionModel", ] ) + _import_structure["models.aria"].extend( + [ + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextForCausalLM", + "AriaTextModel", + "AriaTextPreTrainedModel", + ] + ) 
_import_structure["models.audio_spectrogram_transformer"].extend( [ "ASTForAudioClassification", @@ -1370,6 +1443,7 @@ "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", "MODEL_FOR_IMAGE_MAPPING", "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", "MODEL_FOR_KEYPOINT_DETECTION_MAPPING", @@ -1411,6 +1485,7 @@ "AutoModelForDocumentQuestionAnswering", "AutoModelForImageClassification", "AutoModelForImageSegmentation", + "AutoModelForImageTextToText", "AutoModelForImageToImage", "AutoModelForInstanceSegmentation", "AutoModelForKeypointDetection", @@ -1488,7 +1563,6 @@ "BertForQuestionAnswering", "BertForSequenceClassification", "BertForTokenClassification", - "BertLayer", "BertLMHeadModel", "BertModel", "BertPreTrainedModel", @@ -1512,7 +1586,6 @@ "BigBirdForQuestionAnswering", "BigBirdForSequenceClassification", "BigBirdForTokenClassification", - "BigBirdLayer", "BigBirdModel", "BigBirdPreTrainedModel", "load_tf_weights_in_big_bird", @@ -1575,10 +1648,13 @@ _import_structure["models.blip_2"].extend( [ "Blip2ForConditionalGeneration", + "Blip2ForImageTextRetrieval", "Blip2Model", "Blip2PreTrainedModel", "Blip2QFormerModel", + "Blip2TextModelWithProjection", "Blip2VisionModel", + "Blip2VisionModelWithProjection", ] ) _import_structure["models.bloom"].extend( @@ -1628,7 +1704,6 @@ "CanineForQuestionAnswering", "CanineForSequenceClassification", "CanineForTokenClassification", - "CanineLayer", "CanineModel", "CaninePreTrainedModel", "load_tf_weights_in_canine", @@ -1715,7 +1790,6 @@ "ConvBertForQuestionAnswering", "ConvBertForSequenceClassification", "ConvBertForTokenClassification", - "ConvBertLayer", "ConvBertModel", "ConvBertPreTrainedModel", "load_tf_weights_in_convbert", @@ -1944,7 +2018,6 @@ "QDQBertForQuestionAnswering", "QDQBertForSequenceClassification", "QDQBertForTokenClassification", - "QDQBertLayer", "QDQBertLMHeadModel", "QDQBertModel", "QDQBertPreTrainedModel", @@ -2204,7 +2277,6 @@ "FNetForQuestionAnswering", "FNetForSequenceClassification", "FNetForTokenClassification", - "FNetLayer", "FNetModel", "FNetPreTrainedModel", ] @@ -2260,6 +2332,15 @@ "GitVisionModel", ] ) + _import_structure["models.glm"].extend( + [ + "GlmForCausalLM", + "GlmForSequenceClassification", + "GlmForTokenClassification", + "GlmModel", + "GlmPreTrainedModel", + ] + ) _import_structure["models.glpn"].extend( [ "GLPNForDepthEstimation", @@ -2305,7 +2386,6 @@ "GPTNeoXForQuestionAnswering", "GPTNeoXForSequenceClassification", "GPTNeoXForTokenClassification", - "GPTNeoXLayer", "GPTNeoXModel", "GPTNeoXPreTrainedModel", ] @@ -2313,7 +2393,6 @@ _import_structure["models.gpt_neox_japanese"].extend( [ "GPTNeoXJapaneseForCausalLM", - "GPTNeoXJapaneseLayer", "GPTNeoXJapaneseModel", "GPTNeoXJapanesePreTrainedModel", ] @@ -2327,6 +2406,20 @@ "GPTJPreTrainedModel", ] ) + _import_structure["models.granite"].extend( + [ + "GraniteForCausalLM", + "GraniteModel", + "GranitePreTrainedModel", + ] + ) + _import_structure["models.granitemoe"].extend( + [ + "GraniteMoeForCausalLM", + "GraniteMoeModel", + "GraniteMoePreTrainedModel", + ] + ) _import_structure["models.grounding_dino"].extend( [ "GroundingDinoForObjectDetection", @@ -2386,6 +2479,23 @@ "Idefics2Processor", ] ) + _import_structure["models.idefics3"].extend( + [ + "Idefics3ForConditionalGeneration", + "Idefics3Model", + "Idefics3PreTrainedModel", + "Idefics3Processor", + "Idefics3VisionConfig", + "Idefics3VisionTransformer", + ] + ) + 
_import_structure["models.ijepa"].extend( + [ + "IJepaForImageClassification", + "IJepaModel", + "IJepaPreTrainedModel", + ] + ) _import_structure["models.imagegpt"].extend( [ "ImageGPTForCausalImageModeling", @@ -2523,6 +2633,12 @@ "LlavaNextVideoPreTrainedModel", ] ) + _import_structure["models.llava_onevision"].extend( + [ + "LlavaOnevisionForConditionalGeneration", + "LlavaOnevisionPreTrainedModel", + ] + ) _import_structure["models.longformer"].extend( [ "LongformerForMaskedLM", @@ -2532,7 +2648,6 @@ "LongformerForTokenClassification", "LongformerModel", "LongformerPreTrainedModel", - "LongformerSelfAttention", ] ) _import_structure["models.longt5"].extend( @@ -2565,7 +2680,6 @@ "LxmertModel", "LxmertPreTrainedModel", "LxmertVisualFeatureEncoder", - "LxmertXLayer", ] ) _import_structure["models.m2m_100"].extend( @@ -2589,7 +2703,9 @@ "Mamba2PreTrainedModel", ] ) - _import_structure["models.marian"].extend(["MarianForCausalLM", "MarianModel", "MarianMTModel"]) + _import_structure["models.marian"].extend( + ["MarianForCausalLM", "MarianModel", "MarianMTModel", "MarianPreTrainedModel"] + ) _import_structure["models.markuplm"].extend( [ "MarkupLMForQuestionAnswering", @@ -2645,9 +2761,16 @@ "MgpstrPreTrainedModel", ] ) + _import_structure["models.mimi"].extend( + [ + "MimiModel", + "MimiPreTrainedModel", + ] + ) _import_structure["models.mistral"].extend( [ "MistralForCausalLM", + "MistralForQuestionAnswering", "MistralForSequenceClassification", "MistralForTokenClassification", "MistralModel", @@ -2657,12 +2780,23 @@ _import_structure["models.mixtral"].extend( [ "MixtralForCausalLM", + "MixtralForQuestionAnswering", "MixtralForSequenceClassification", "MixtralForTokenClassification", "MixtralModel", "MixtralPreTrainedModel", ] ) + _import_structure["models.mllama"].extend( + [ + "MllamaForCausalLM", + "MllamaForConditionalGeneration", + "MllamaPreTrainedModel", + "MllamaProcessor", + "MllamaTextModel", + "MllamaVisionModel", + ] + ) _import_structure["models.mobilebert"].extend( [ "MobileBertForMaskedLM", @@ -2672,7 +2806,6 @@ "MobileBertForQuestionAnswering", "MobileBertForSequenceClassification", "MobileBertForTokenClassification", - "MobileBertLayer", "MobileBertModel", "MobileBertPreTrainedModel", "load_tf_weights_in_mobilebert", @@ -2711,6 +2844,14 @@ "MobileViTV2PreTrainedModel", ] ) + _import_structure["models.moshi"].extend( + [ + "MoshiForCausalLM", + "MoshiForConditionalGeneration", + "MoshiModel", + "MoshiPreTrainedModel", + ] + ) _import_structure["models.mpnet"].extend( [ "MPNetForMaskedLM", @@ -2718,7 +2859,6 @@ "MPNetForQuestionAnswering", "MPNetForSequenceClassification", "MPNetForTokenClassification", - "MPNetLayer", "MPNetModel", "MPNetPreTrainedModel", ] @@ -2808,7 +2948,6 @@ "NystromformerForQuestionAnswering", "NystromformerForSequenceClassification", "NystromformerForTokenClassification", - "NystromformerLayer", "NystromformerModel", "NystromformerPreTrainedModel", ] @@ -2820,6 +2959,26 @@ "OlmoPreTrainedModel", ] ) + _import_structure["models.olmo2"].extend( + [ + "Olmo2ForCausalLM", + "Olmo2Model", + "Olmo2PreTrainedModel", + ] + ) + _import_structure["models.olmoe"].extend( + [ + "OlmoeForCausalLM", + "OlmoeModel", + "OlmoePreTrainedModel", + ] + ) + _import_structure["models.omdet_turbo"].extend( + [ + "OmDetTurboForObjectDetection", + "OmDetTurboPreTrainedModel", + ] + ) _import_structure["models.oneformer"].extend( [ "OneFormerForUniversalSegmentation", @@ -2915,7 +3074,6 @@ "PerceiverForMultimodalAutoencoding", "PerceiverForOpticalFlow", 
"PerceiverForSequenceClassification", - "PerceiverLayer", "PerceiverModel", "PerceiverPreTrainedModel", ] @@ -2947,6 +3105,14 @@ "Phi3PreTrainedModel", ] ) + _import_structure["models.phimoe"].extend( + [ + "PhimoeForCausalLM", + "PhimoeForSequenceClassification", + "PhimoeModel", + "PhimoePreTrainedModel", + ] + ) _import_structure["models.pix2struct"].extend( [ "Pix2StructForConditionalGeneration", @@ -2955,6 +3121,7 @@ "Pix2StructVisionModel", ] ) + _import_structure["models.pixtral"].extend(["PixtralPreTrainedModel", "PixtralVisionModel"]) _import_structure["models.plbart"].extend( [ "PLBartForCausalLM", @@ -3005,6 +3172,7 @@ _import_structure["models.qwen2"].extend( [ "Qwen2ForCausalLM", + "Qwen2ForQuestionAnswering", "Qwen2ForSequenceClassification", "Qwen2ForTokenClassification", "Qwen2Model", @@ -3021,12 +3189,20 @@ _import_structure["models.qwen2_moe"].extend( [ "Qwen2MoeForCausalLM", + "Qwen2MoeForQuestionAnswering", "Qwen2MoeForSequenceClassification", "Qwen2MoeForTokenClassification", "Qwen2MoeModel", "Qwen2MoePreTrainedModel", ] ) + _import_structure["models.qwen2_vl"].extend( + [ + "Qwen2VLForConditionalGeneration", + "Qwen2VLModel", + "Qwen2VLPreTrainedModel", + ] + ) _import_structure["models.rag"].extend( [ "RagModel", @@ -3044,11 +3220,9 @@ ) _import_structure["models.reformer"].extend( [ - "ReformerAttention", "ReformerForMaskedLM", "ReformerForQuestionAnswering", "ReformerForSequenceClassification", - "ReformerLayer", "ReformerModel", "ReformerModelWithLMHead", "ReformerPreTrainedModel", @@ -3069,7 +3243,6 @@ "RemBertForQuestionAnswering", "RemBertForSequenceClassification", "RemBertForTokenClassification", - "RemBertLayer", "RemBertModel", "RemBertPreTrainedModel", "load_tf_weights_in_rembert", @@ -3116,7 +3289,6 @@ "RoCBertForQuestionAnswering", "RoCBertForSequenceClassification", "RoCBertForTokenClassification", - "RoCBertLayer", "RoCBertModel", "RoCBertPreTrainedModel", "load_tf_weights_in_roc_bert", @@ -3130,7 +3302,6 @@ "RoFormerForQuestionAnswering", "RoFormerForSequenceClassification", "RoFormerForTokenClassification", - "RoFormerLayer", "RoFormerModel", "RoFormerPreTrainedModel", "load_tf_weights_in_roformer", @@ -3187,7 +3358,6 @@ "SegformerDecodeHead", "SegformerForImageClassification", "SegformerForSemanticSegmentation", - "SegformerLayer", "SegformerModel", "SegformerPreTrainedModel", ] @@ -3246,7 +3416,6 @@ [ "SplinterForPreTraining", "SplinterForQuestionAnswering", - "SplinterLayer", "SplinterModel", "SplinterPreTrainedModel", ] @@ -3259,7 +3428,6 @@ "SqueezeBertForSequenceClassification", "SqueezeBertForTokenClassification", "SqueezeBertModel", - "SqueezeBertModule", "SqueezeBertPreTrainedModel", ] ) @@ -3458,7 +3626,6 @@ "ViltForMaskedLM", "ViltForQuestionAnswering", "ViltForTokenClassification", - "ViltLayer", "ViltModel", "ViltPreTrainedModel", ] @@ -3478,7 +3645,6 @@ "VisualBertForQuestionAnswering", "VisualBertForRegionToPhraseAlignment", "VisualBertForVisualReasoning", - "VisualBertLayer", "VisualBertModel", "VisualBertPreTrainedModel", ] @@ -3494,7 +3660,6 @@ _import_structure["models.vit_mae"].extend( [ "ViTMAEForPreTraining", - "ViTMAELayer", "ViTMAEModel", "ViTMAEPreTrainedModel", ] @@ -3674,11 +3839,18 @@ "YosoForQuestionAnswering", "YosoForSequenceClassification", "YosoForTokenClassification", - "YosoLayer", "YosoModel", "YosoPreTrainedModel", ] ) + _import_structure["models.zamba"].extend( + [ + "ZambaForCausalLM", + "ZambaForSequenceClassification", + "ZambaModel", + "ZambaPreTrainedModel", + ] + ) 
_import_structure["models.zoedepth"].extend( [ "ZoeDepthForDepthEstimation", @@ -3821,7 +3993,6 @@ ) _import_structure["models.bert"].extend( [ - "TFBertEmbeddings", "TFBertForMaskedLM", "TFBertForMultipleChoice", "TFBertForNextSentencePrediction", @@ -3887,7 +4058,6 @@ "TFConvBertForQuestionAnswering", "TFConvBertForSequenceClassification", "TFConvBertForTokenClassification", - "TFConvBertLayer", "TFConvBertModel", "TFConvBertPreTrainedModel", ] @@ -4118,7 +4288,6 @@ "TFLongformerForTokenClassification", "TFLongformerModel", "TFLongformerPreTrainedModel", - "TFLongformerSelfAttention", ] ) _import_structure["models.lxmert"].extend( @@ -4219,7 +4388,6 @@ "TFRemBertForQuestionAnswering", "TFRemBertForSequenceClassification", "TFRemBertForTokenClassification", - "TFRemBertLayer", "TFRemBertModel", "TFRemBertPreTrainedModel", ] @@ -4265,7 +4433,6 @@ "TFRoFormerForQuestionAnswering", "TFRoFormerForSequenceClassification", "TFRoFormerForTokenClassification", - "TFRoFormerLayer", "TFRoFormerModel", "TFRoFormerPreTrainedModel", ] @@ -4792,7 +4959,8 @@ from .agents import ( Agent, CodeAgent, - HfEngine, + HfApiEngine, + ManagedAgent, PipelineTool, ReactAgent, ReactCodeAgent, @@ -4800,9 +4968,11 @@ Tool, Toolbox, ToolCollection, + TransformersEngine, launch_gradio_demo, load_tool, stream_to_gradio, + tool, ) from .configuration_utils import PretrainedConfig @@ -4846,7 +5016,7 @@ from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin # Generation - from .generation import GenerationConfig, TextIteratorStreamer, TextStreamer, WatermarkingConfig + from .generation import CompileConfig, GenerationConfig, TextIteratorStreamer, TextStreamer, WatermarkingConfig from .hf_argparser import HfArgumentParser # Integrations @@ -4889,6 +5059,11 @@ AltCLIPTextConfig, AltCLIPVisionConfig, ) + from .models.aria import ( + AriaConfig, + AriaProcessor, + AriaTextConfig, + ) from .models.audio_spectrogram_transformer import ( ASTConfig, ASTFeatureExtractor, @@ -5198,6 +5373,7 @@ GitProcessor, GitVisionConfig, ) + from .models.glm import GlmConfig from .models.glpn import GLPNConfig from .models.gpt2 import ( GPT2Config, @@ -5212,6 +5388,8 @@ GPTNeoXJapaneseConfig, ) from .models.gptj import GPTJConfig + from .models.granite import GraniteConfig + from .models.granitemoe import GraniteMoeConfig from .models.grounding_dino import ( GroundingDinoConfig, GroundingDinoProcessor, @@ -5229,6 +5407,8 @@ IdeficsConfig, ) from .models.idefics2 import Idefics2Config + from .models.idefics3 import Idefics3Config + from .models.ijepa import IJepaConfig from .models.imagegpt import ImageGPTConfig from .models.informer import InformerConfig from .models.instructblip import ( @@ -5284,6 +5464,10 @@ LlavaNextVideoConfig, LlavaNextVideoProcessor, ) + from .models.llava_onevision import ( + LlavaOnevisionConfig, + LlavaOnevisionProcessor, + ) from .models.longformer import ( LongformerConfig, LongformerTokenizer, @@ -5323,8 +5507,15 @@ MgpstrProcessor, MgpstrTokenizer, ) + from .models.mimi import ( + MimiConfig, + ) from .models.mistral import MistralConfig from .models.mixtral import MixtralConfig + from .models.mllama import ( + MllamaConfig, + MllamaProcessor, + ) from .models.mobilebert import ( MobileBertConfig, MobileBertTokenizer, @@ -5341,6 +5532,10 @@ from .models.mobilevitv2 import ( MobileViTV2Config, ) + from .models.moshi import ( + MoshiConfig, + MoshiDepthConfig, + ) from .models.mpnet import ( MPNetConfig, MPNetTokenizer, @@ -5357,6 +5552,7 @@ MusicgenMelodyDecoderConfig, ) from .models.mvp import 
MvpConfig, MvpTokenizer + from .models.myt5 import MyT5Tokenizer from .models.nemotron import NemotronConfig from .models.nllb_moe import NllbMoeConfig from .models.nougat import NougatProcessor @@ -5364,6 +5560,12 @@ NystromformerConfig, ) from .models.olmo import OlmoConfig + from .models.olmo2 import Olmo2Config + from .models.olmoe import OlmoeConfig + from .models.omdet_turbo import ( + OmDetTurboConfig, + OmDetTurboProcessor, + ) from .models.oneformer import ( OneFormerConfig, OneFormerProcessor, @@ -5408,6 +5610,7 @@ ) from .models.phi import PhiConfig from .models.phi3 import Phi3Config + from .models.phimoe import PhimoeConfig from .models.phobert import PhobertTokenizer from .models.pix2struct import ( Pix2StructConfig, @@ -5415,6 +5618,10 @@ Pix2StructTextConfig, Pix2StructVisionConfig, ) + from .models.pixtral import ( + PixtralProcessor, + PixtralVisionConfig, + ) from .models.plbart import PLBartConfig from .models.poolformer import ( PoolFormerConfig, @@ -5435,6 +5642,10 @@ Qwen2AudioProcessor, ) from .models.qwen2_moe import Qwen2MoeConfig + from .models.qwen2_vl import ( + Qwen2VLConfig, + Qwen2VLProcessor, + ) from .models.rag import RagConfig, RagRetriever, RagTokenizer from .models.recurrent_gemma import RecurrentGemmaConfig from .models.reformer import ReformerConfig @@ -5624,6 +5835,7 @@ from .models.xmod import XmodConfig from .models.yolos import YolosConfig from .models.yoso import YosoConfig + from .models.zamba import ZambaConfig from .models.zoedepth import ZoeDepthConfig # Pipelines @@ -5638,6 +5850,7 @@ ImageClassificationPipeline, ImageFeatureExtractionPipeline, ImageSegmentationPipeline, + ImageTextToTextPipeline, ImageToImagePipeline, ImageToTextPipeline, JsonPipelineDataFormat, @@ -5716,7 +5929,6 @@ is_av_available, is_bitsandbytes_available, is_datasets_available, - is_decord_available, is_faiss_available, is_flax_available, is_keras_nlp_available, @@ -5751,7 +5963,9 @@ from .utils.quantization_config import ( AqlmConfig, AwqConfig, + BitNetConfig, BitsAndBytesConfig, + CompressedTensorsConfig, EetqConfig, FbgemmFp8Config, GPTQConfig, @@ -5784,7 +5998,8 @@ from .models.llama import LlamaTokenizer from .models.m2m_100 import M2M100Tokenizer from .models.marian import MarianTokenizer - from .models.mbart import MBart50Tokenizer, MBartTokenizer + from .models.mbart import MBartTokenizer + from .models.mbart50 import MBart50Tokenizer from .models.mluke import MLukeTokenizer from .models.mt5 import MT5Tokenizer from .models.nllb import NllbTokenizer @@ -5913,6 +6128,7 @@ from .image_processing_base import ImageProcessingMixin from .image_processing_utils import BaseImageProcessor from .image_utils import ImageFeatureExtractionMixin + from .models.aria import AriaImageProcessor from .models.beit import BeitFeatureExtractor, BeitImageProcessor from .models.bit import BitImageProcessor from .models.blip import BlipImageProcessor @@ -5928,10 +6144,7 @@ ConditionalDetrImageProcessor, ) from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor - from .models.deformable_detr import ( - DeformableDetrFeatureExtractor, - DeformableDetrImageProcessor, - ) + from .models.deformable_detr import DeformableDetrFeatureExtractor, DeformableDetrImageProcessor from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor from .models.deprecated.deta import DetaImageProcessor from .models.deprecated.efficientformer import EfficientFormerImageProcessor @@ -5951,6 +6164,7 @@ from .models.grounding_dino import GroundingDinoImageProcessor from 
.models.idefics import IdeficsImageProcessor from .models.idefics2 import Idefics2ImageProcessor + from .models.idefics3 import Idefics3ImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.instructblipvideo import InstructBlipVideoImageProcessor from .models.layoutlmv2 import ( @@ -5964,11 +6178,13 @@ from .models.levit import LevitFeatureExtractor, LevitImageProcessor from .models.llava_next import LlavaNextImageProcessor from .models.llava_next_video import LlavaNextVideoImageProcessor + from .models.llava_onevision import LlavaOnevisionImageProcessor, LlavaOnevisionVideoProcessor from .models.mask2former import Mask2FormerImageProcessor from .models.maskformer import ( MaskFormerFeatureExtractor, MaskFormerImageProcessor, ) + from .models.mllama import MllamaImageProcessor from .models.mobilenet_v1 import ( MobileNetV1FeatureExtractor, MobileNetV1ImageProcessor, @@ -5984,11 +6200,13 @@ from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor from .models.pix2struct import Pix2StructImageProcessor + from .models.pixtral import PixtralImageProcessor from .models.poolformer import ( PoolFormerFeatureExtractor, PoolFormerImageProcessor, ) from .models.pvt import PvtImageProcessor + from .models.qwen2_vl import Qwen2VLImageProcessor from .models.rt_detr import RTDetrImageProcessor from .models.sam import SamImageProcessor from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor @@ -6013,6 +6231,10 @@ from .utils.dummy_torchvision_objects import * else: from .image_processing_utils_fast import BaseImageProcessorFast + from .models.deformable_detr import DeformableDetrImageProcessorFast + from .models.detr import DetrImageProcessorFast + from .models.pixtral import PixtralImageProcessorFast + from .models.rt_detr import RTDetrImageProcessorFast from .models.vit import ViTImageProcessorFast # Modeling @@ -6034,6 +6256,7 @@ HybridCache, MambaCache, OffloadedCache, + OffloadedStaticCache, QuantizedCache, QuantizedCacheConfig, QuantoQuantizedCache, @@ -6054,6 +6277,8 @@ ) from .generation import ( AlternatingCodebooksLogitsProcessor, + BayesianDetectorConfig, + BayesianDetectorModel, BeamScorer, BeamSearchScorer, ClassifierFreeGuidanceLogitsProcessor, @@ -6069,7 +6294,6 @@ ExponentialDecayLengthPenalty, ForcedBOSTokenLogitsProcessor, ForcedEOSTokenLogitsProcessor, - ForceTokensLogitsProcessor, GenerationMixin, HammingDiversityLogitsProcessor, InfNanRemoveLogitsProcessor, @@ -6093,6 +6317,9 @@ StopStringCriteria, SuppressTokensAtBeginLogitsProcessor, SuppressTokensLogitsProcessor, + SynthIDTextWatermarkDetector, + SynthIDTextWatermarkingConfig, + SynthIDTextWatermarkLogitsProcessor, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper, @@ -6102,6 +6329,10 @@ WatermarkLogitsProcessor, WhisperTimeStampLogitsProcessor, ) + from .integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) from .modeling_rope_utils import ROPE_INIT_FUNCTIONS from .modeling_utils import PreTrainedModel from .models.albert import ( @@ -6127,6 +6358,13 @@ AltCLIPTextModel, AltCLIPVisionModel, ) + from .models.aria import ( + AriaForConditionalGeneration, + AriaPreTrainedModel, + AriaTextForCausalLM, + AriaTextModel, + AriaTextPreTrainedModel, + ) from .models.audio_spectrogram_transformer import ( ASTForAudioClassification, ASTModel, @@ -6145,6 +6383,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING, 
MODEL_FOR_IMAGE_MAPPING, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING, + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING, MODEL_FOR_KEYPOINT_DETECTION_MAPPING, @@ -6186,6 +6425,7 @@ AutoModelForDocumentQuestionAnswering, AutoModelForImageClassification, AutoModelForImageSegmentation, + AutoModelForImageTextToText, AutoModelForImageToImage, AutoModelForInstanceSegmentation, AutoModelForKeypointDetection, @@ -6253,7 +6493,6 @@ BertForQuestionAnswering, BertForSequenceClassification, BertForTokenClassification, - BertLayer, BertLMHeadModel, BertModel, BertPreTrainedModel, @@ -6273,7 +6512,6 @@ BigBirdForQuestionAnswering, BigBirdForSequenceClassification, BigBirdForTokenClassification, - BigBirdLayer, BigBirdModel, BigBirdPreTrainedModel, load_tf_weights_in_big_bird, @@ -6322,10 +6560,13 @@ ) from .models.blip_2 import ( Blip2ForConditionalGeneration, + Blip2ForImageTextRetrieval, Blip2Model, Blip2PreTrainedModel, Blip2QFormerModel, + Blip2TextModelWithProjection, Blip2VisionModel, + Blip2VisionModelWithProjection, ) from .models.bloom import ( BloomForCausalLM, @@ -6365,7 +6606,6 @@ CanineForQuestionAnswering, CanineForSequenceClassification, CanineForTokenClassification, - CanineLayer, CanineModel, CaninePreTrainedModel, load_tf_weights_in_canine, @@ -6438,7 +6678,6 @@ ConvBertForQuestionAnswering, ConvBertForSequenceClassification, ConvBertForTokenClassification, - ConvBertLayer, ConvBertModel, ConvBertPreTrainedModel, load_tf_weights_in_convbert, @@ -6623,7 +6862,6 @@ QDQBertForQuestionAnswering, QDQBertForSequenceClassification, QDQBertForTokenClassification, - QDQBertLayer, QDQBertLMHeadModel, QDQBertModel, QDQBertPreTrainedModel, @@ -6828,7 +7066,6 @@ FNetForQuestionAnswering, FNetForSequenceClassification, FNetForTokenClassification, - FNetLayer, FNetModel, FNetPreTrainedModel, ) @@ -6880,6 +7117,13 @@ GitPreTrainedModel, GitVisionModel, ) + from .models.glm import ( + GlmForCausalLM, + GlmForSequenceClassification, + GlmForTokenClassification, + GlmModel, + GlmPreTrainedModel, + ) from .models.glpn import ( GLPNForDepthEstimation, GLPNModel, @@ -6916,13 +7160,11 @@ GPTNeoXForQuestionAnswering, GPTNeoXForSequenceClassification, GPTNeoXForTokenClassification, - GPTNeoXLayer, GPTNeoXModel, GPTNeoXPreTrainedModel, ) from .models.gpt_neox_japanese import ( GPTNeoXJapaneseForCausalLM, - GPTNeoXJapaneseLayer, GPTNeoXJapaneseModel, GPTNeoXJapanesePreTrainedModel, ) @@ -6933,6 +7175,16 @@ GPTJModel, GPTJPreTrainedModel, ) + from .models.granite import ( + GraniteForCausalLM, + GraniteModel, + GranitePreTrainedModel, + ) + from .models.granitemoe import ( + GraniteMoeForCausalLM, + GraniteMoeModel, + GraniteMoePreTrainedModel, + ) from .models.grounding_dino import ( GroundingDinoForObjectDetection, GroundingDinoModel, @@ -6978,6 +7230,19 @@ Idefics2PreTrainedModel, Idefics2Processor, ) + from .models.idefics3 import ( + Idefics3ForConditionalGeneration, + Idefics3Model, + Idefics3PreTrainedModel, + Idefics3Processor, + Idefics3VisionConfig, + Idefics3VisionTransformer, + ) + from .models.ijepa import ( + IJepaForImageClassification, + IJepaModel, + IJepaPreTrainedModel, + ) from .models.imagegpt import ( ImageGPTForCausalImageModeling, ImageGPTForImageClassification, @@ -7081,6 +7346,10 @@ LlavaNextVideoForConditionalGeneration, LlavaNextVideoPreTrainedModel, ) + from .models.llava_onevision import ( + LlavaOnevisionForConditionalGeneration, + LlavaOnevisionPreTrainedModel, + ) from .models.longformer import ( LongformerForMaskedLM, 
LongformerForMultipleChoice, @@ -7089,7 +7358,6 @@ LongformerForTokenClassification, LongformerModel, LongformerPreTrainedModel, - LongformerSelfAttention, ) from .models.longt5 import ( LongT5EncoderModel, @@ -7116,7 +7384,6 @@ LxmertModel, LxmertPreTrainedModel, LxmertVisualFeatureEncoder, - LxmertXLayer, ) from .models.m2m_100 import ( M2M100ForConditionalGeneration, @@ -7133,7 +7400,7 @@ Mamba2Model, Mamba2PreTrainedModel, ) - from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel + from .models.marian import MarianForCausalLM, MarianModel, MarianMTModel, MarianPreTrainedModel from .models.markuplm import ( MarkupLMForQuestionAnswering, MarkupLMForSequenceClassification, @@ -7177,8 +7444,13 @@ MgpstrModel, MgpstrPreTrainedModel, ) + from .models.mimi import ( + MimiModel, + MimiPreTrainedModel, + ) from .models.mistral import ( MistralForCausalLM, + MistralForQuestionAnswering, MistralForSequenceClassification, MistralForTokenClassification, MistralModel, @@ -7186,11 +7458,20 @@ ) from .models.mixtral import ( MixtralForCausalLM, + MixtralForQuestionAnswering, MixtralForSequenceClassification, MixtralForTokenClassification, MixtralModel, MixtralPreTrainedModel, ) + from .models.mllama import ( + MllamaForCausalLM, + MllamaForConditionalGeneration, + MllamaPreTrainedModel, + MllamaProcessor, + MllamaTextModel, + MllamaVisionModel, + ) from .models.mobilebert import ( MobileBertForMaskedLM, MobileBertForMultipleChoice, @@ -7199,7 +7480,6 @@ MobileBertForQuestionAnswering, MobileBertForSequenceClassification, MobileBertForTokenClassification, - MobileBertLayer, MobileBertModel, MobileBertPreTrainedModel, load_tf_weights_in_mobilebert, @@ -7229,13 +7509,18 @@ MobileViTV2Model, MobileViTV2PreTrainedModel, ) + from .models.moshi import ( + MoshiForCausalLM, + MoshiForConditionalGeneration, + MoshiModel, + MoshiPreTrainedModel, + ) from .models.mpnet import ( MPNetForMaskedLM, MPNetForMultipleChoice, MPNetForQuestionAnswering, MPNetForSequenceClassification, MPNetForTokenClassification, - MPNetLayer, MPNetModel, MPNetPreTrainedModel, ) @@ -7307,7 +7592,6 @@ NystromformerForQuestionAnswering, NystromformerForSequenceClassification, NystromformerForTokenClassification, - NystromformerLayer, NystromformerModel, NystromformerPreTrainedModel, ) @@ -7316,6 +7600,20 @@ OlmoModel, OlmoPreTrainedModel, ) + from .models.olmo2 import ( + Olmo2ForCausalLM, + Olmo2Model, + Olmo2PreTrainedModel, + ) + from .models.olmoe import ( + OlmoeForCausalLM, + OlmoeModel, + OlmoePreTrainedModel, + ) + from .models.omdet_turbo import ( + OmDetTurboForObjectDetection, + OmDetTurboPreTrainedModel, + ) from .models.oneformer import ( OneFormerForUniversalSegmentation, OneFormerModel, @@ -7390,7 +7688,6 @@ PerceiverForMultimodalAutoencoding, PerceiverForOpticalFlow, PerceiverForSequenceClassification, - PerceiverLayer, PerceiverModel, PerceiverPreTrainedModel, ) @@ -7415,12 +7712,22 @@ Phi3Model, Phi3PreTrainedModel, ) + from .models.phimoe import ( + PhimoeForCausalLM, + PhimoeForSequenceClassification, + PhimoeModel, + PhimoePreTrainedModel, + ) from .models.pix2struct import ( Pix2StructForConditionalGeneration, Pix2StructPreTrainedModel, Pix2StructTextModel, Pix2StructVisionModel, ) + from .models.pixtral import ( + PixtralPreTrainedModel, + PixtralVisionModel, + ) from .models.plbart import ( PLBartForCausalLM, PLBartForConditionalGeneration, @@ -7458,6 +7765,7 @@ ) from .models.qwen2 import ( Qwen2ForCausalLM, + Qwen2ForQuestionAnswering, Qwen2ForSequenceClassification, 
Qwen2ForTokenClassification, Qwen2Model, @@ -7470,11 +7778,17 @@ ) from .models.qwen2_moe import ( Qwen2MoeForCausalLM, + Qwen2MoeForQuestionAnswering, Qwen2MoeForSequenceClassification, Qwen2MoeForTokenClassification, Qwen2MoeModel, Qwen2MoePreTrainedModel, ) + from .models.qwen2_vl import ( + Qwen2VLForConditionalGeneration, + Qwen2VLModel, + Qwen2VLPreTrainedModel, + ) from .models.rag import ( RagModel, RagPreTrainedModel, @@ -7487,11 +7801,9 @@ RecurrentGemmaPreTrainedModel, ) from .models.reformer import ( - ReformerAttention, ReformerForMaskedLM, ReformerForQuestionAnswering, ReformerForSequenceClassification, - ReformerLayer, ReformerModel, ReformerModelWithLMHead, ReformerPreTrainedModel, @@ -7508,7 +7820,6 @@ RemBertForQuestionAnswering, RemBertForSequenceClassification, RemBertForTokenClassification, - RemBertLayer, RemBertModel, RemBertPreTrainedModel, load_tf_weights_in_rembert, @@ -7547,7 +7858,6 @@ RoCBertForQuestionAnswering, RoCBertForSequenceClassification, RoCBertForTokenClassification, - RoCBertLayer, RoCBertModel, RoCBertPreTrainedModel, load_tf_weights_in_roc_bert, @@ -7559,7 +7869,6 @@ RoFormerForQuestionAnswering, RoFormerForSequenceClassification, RoFormerForTokenClassification, - RoFormerLayer, RoFormerModel, RoFormerPreTrainedModel, load_tf_weights_in_roformer, @@ -7604,7 +7913,6 @@ SegformerDecodeHead, SegformerForImageClassification, SegformerForSemanticSegmentation, - SegformerLayer, SegformerModel, SegformerPreTrainedModel, ) @@ -7649,7 +7957,6 @@ from .models.splinter import ( SplinterForPreTraining, SplinterForQuestionAnswering, - SplinterLayer, SplinterModel, SplinterPreTrainedModel, ) @@ -7660,7 +7967,6 @@ SqueezeBertForSequenceClassification, SqueezeBertForTokenClassification, SqueezeBertModel, - SqueezeBertModule, SqueezeBertPreTrainedModel, ) from .models.stablelm import ( @@ -7809,7 +8115,6 @@ ViltForMaskedLM, ViltForQuestionAnswering, ViltForTokenClassification, - ViltLayer, ViltModel, ViltPreTrainedModel, ) @@ -7825,7 +8130,6 @@ VisualBertForQuestionAnswering, VisualBertForRegionToPhraseAlignment, VisualBertForVisualReasoning, - VisualBertLayer, VisualBertModel, VisualBertPreTrainedModel, ) @@ -7837,7 +8141,6 @@ ) from .models.vit_mae import ( ViTMAEForPreTraining, - ViTMAELayer, ViTMAEModel, ViTMAEPreTrainedModel, ) @@ -7979,10 +8282,15 @@ YosoForQuestionAnswering, YosoForSequenceClassification, YosoForTokenClassification, - YosoLayer, YosoModel, YosoPreTrainedModel, ) + from .models.zamba import ( + ZambaForCausalLM, + ZambaForSequenceClassification, + ZambaModel, + ZambaPreTrainedModel, + ) from .models.zoedepth import ( ZoeDepthForDepthEstimation, ZoeDepthPreTrainedModel, @@ -8113,7 +8421,6 @@ TFBartPretrainedModel, ) from .models.bert import ( - TFBertEmbeddings, TFBertForMaskedLM, TFBertForMultipleChoice, TFBertForNextSentencePrediction, @@ -8167,7 +8474,6 @@ TFConvBertForQuestionAnswering, TFConvBertForSequenceClassification, TFConvBertForTokenClassification, - TFConvBertLayer, TFConvBertModel, TFConvBertPreTrainedModel, ) @@ -8352,7 +8658,6 @@ TFLongformerForTokenClassification, TFLongformerModel, TFLongformerPreTrainedModel, - TFLongformerSelfAttention, ) from .models.lxmert import ( TFLxmertForPreTraining, @@ -8442,7 +8747,6 @@ TFRemBertForQuestionAnswering, TFRemBertForSequenceClassification, TFRemBertForTokenClassification, - TFRemBertLayer, TFRemBertModel, TFRemBertPreTrainedModel, ) @@ -8480,7 +8784,6 @@ TFRoFormerForQuestionAnswering, TFRoFormerForSequenceClassification, TFRoFormerForTokenClassification, - TFRoFormerLayer, 
TFRoFormerModel, TFRoFormerPreTrainedModel, ) @@ -8927,4 +9230,4 @@ "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " "Models won't be available and only tokenizers, configuration " "and file/data utilities can be used." - ) \ No newline at end of file + ) diff --git a/src/transformers/commands/add_new_model_like.py b/src/transformers/commands/add_new_model_like.py index eb1986cfa292e1..97f76c07e968ee 100644 --- a/src/transformers/commands/add_new_model_like.py +++ b/src/transformers/commands/add_new_model_like.py @@ -766,6 +766,7 @@ def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None): image_processor_class = image_processor_classes[0] # we take the slow image processor class. else: image_processor_class = image_processor_classes + feature_extractor_class = auto_module.feature_extraction_auto.FEATURE_EXTRACTOR_MAPPING_NAMES.get(model_type, None) processor_class = auto_module.processing_auto.PROCESSOR_MAPPING_NAMES.get(model_type, None) @@ -1714,4 +1715,4 @@ def get_user_input(): ) frameworks = list(set(frameworks.split(" "))) - return (old_model_type, model_patterns, add_copied_from, frameworks, old_checkpoint) \ No newline at end of file + return (old_model_type, model_patterns, add_copied_from, frameworks, old_checkpoint) diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 59ec31a73c0e85..2555b6de47b082 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -16,6 +16,7 @@ albert, align, altclip, + aria, audio_spectrogram_transformer, auto, autoformer, @@ -98,6 +99,7 @@ gemma, gemma2, git, + glm, glpn, gpt2, gpt_bigcode, @@ -106,6 +108,8 @@ gpt_neox_japanese, gpt_sw3, gptj, + granite, + granitemoe, grounding_dino, groupvit, herbert, @@ -114,6 +118,8 @@ ibert, idefics, idefics2, + idefics3, + ijepa, imagegpt, informer, instructblip, @@ -132,6 +138,7 @@ llava, llava_next, llava_next_video, + llava_onevision, longformer, longt5, luke, @@ -148,14 +155,17 @@ megatron_bert, megatron_gpt2, mgp_str, + mimi, mistral, mixtral, + mllama, mluke, mobilebert, mobilenet_v1, mobilenet_v2, mobilevit, mobilevitv2, + moshi, mpnet, mpt, mra, @@ -163,12 +173,16 @@ musicgen, musicgen_melody, mvp, + myt5, nemotron, nllb, nllb_moe, nougat, nystromformer, olmo, + olmo2, + olmoe, + omdet_turbo, oneformer, openai, opt, @@ -183,8 +197,10 @@ persimmon, phi, phi3, + phimoe, phobert, pix2struct, + pixtral, plbart, poolformer, pop2piano, @@ -194,6 +210,7 @@ qwen2, qwen2_audio, qwen2_moe, + qwen2_vl, rag, recurrent_gemma, reformer, @@ -271,5 +288,6 @@ xmod, yolos, yoso, + zamba, zoedepth, -) \ No newline at end of file +) diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 493c02dc32fb08..d93275ffc8788f 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -35,6 +35,8 @@ ("albert", "AlbertConfig"), ("align", "AlignConfig"), ("altclip", "AltCLIPConfig"), + ("aria", "AriaConfig"), + ("aria_text", "AriaTextConfig"), ("audio-spectrogram-transformer", "ASTConfig"), ("autoformer", "AutoformerConfig"), ("bark", "BarkConfig"), @@ -60,6 +62,7 @@ ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"), ("clap", "ClapConfig"), ("clip", "CLIPConfig"), + ("clip_text_model", "CLIPTextConfig"), ("clip_vision_model", "CLIPVisionConfig"), ("clipseg", "CLIPSegConfig"), ("clvp", "ClvpConfig"), @@ -114,6 +117,7 @@ ("gemma", "GemmaConfig"), ("gemma2", "Gemma2Config"), ("git", 
"GitConfig"), + ("glm", "GlmConfig"), ("glpn", "GLPNConfig"), ("gpt-sw3", "GPT2Config"), ("gpt2", "GPT2Config"), @@ -123,6 +127,8 @@ ("gpt_neox_japanese", "GPTNeoXJapaneseConfig"), ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), + ("granite", "GraniteConfig"), + ("granitemoe", "GraniteMoeConfig"), ("graphormer", "GraphormerConfig"), ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), @@ -131,6 +137,9 @@ ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), ("idefics2", "Idefics2Config"), + ("idefics3", "Idefics3Config"), + ("idefics3_vision", "Idefics3VisionConfig"), + ("ijepa", "IJepaConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), ("instructblip", "InstructBlipConfig"), @@ -149,6 +158,7 @@ ("llava", "LlavaConfig"), ("llava_next", "LlavaNextConfig"), ("llava_next_video", "LlavaNextVideoConfig"), + ("llava_onevision", "LlavaOnevisionConfig"), ("longformer", "LongformerConfig"), ("longt5", "LongT5Config"), ("luke", "LukeConfig"), @@ -166,13 +176,16 @@ ("mega", "MegaConfig"), ("megatron-bert", "MegatronBertConfig"), ("mgp-str", "MgpstrConfig"), + ("mimi", "MimiConfig"), ("mistral", "MistralConfig"), ("mixtral", "MixtralConfig"), + ("mllama", "MllamaConfig"), ("mobilebert", "MobileBertConfig"), ("mobilenet_v1", "MobileNetV1Config"), ("mobilenet_v2", "MobileNetV2Config"), ("mobilevit", "MobileViTConfig"), ("mobilevitv2", "MobileViTV2Config"), + ("moshi", "MoshiConfig"), ("mpnet", "MPNetConfig"), ("mpt", "MptConfig"), ("mra", "MraConfig"), @@ -187,6 +200,9 @@ ("nougat", "VisionEncoderDecoderConfig"), ("nystromformer", "NystromformerConfig"), ("olmo", "OlmoConfig"), + ("olmo2", "Olmo2Config"), + ("olmoe", "OlmoeConfig"), + ("omdet-turbo", "OmDetTurboConfig"), ("oneformer", "OneFormerConfig"), ("open-llama", "OpenLlamaConfig"), ("openai-gpt", "OpenAIGPTConfig"), @@ -202,7 +218,9 @@ ("persimmon", "PersimmonConfig"), ("phi", "PhiConfig"), ("phi3", "Phi3Config"), + ("phimoe", "PhimoeConfig"), ("pix2struct", "Pix2StructConfig"), + ("pixtral", "PixtralVisionConfig"), ("plbart", "PLBartConfig"), ("poolformer", "PoolFormerConfig"), ("pop2piano", "Pop2PianoConfig"), @@ -214,6 +232,7 @@ ("qwen2_audio", "Qwen2AudioConfig"), ("qwen2_audio_encoder", "Qwen2AudioEncoderConfig"), ("qwen2_moe", "Qwen2MoeConfig"), + ("qwen2_vl", "Qwen2VLConfig"), ("rag", "RagConfig"), ("realm", "RealmConfig"), ("recurrent_gemma", "RecurrentGemmaConfig"), @@ -300,6 +319,7 @@ ("xmod", "XmodConfig"), ("yolos", "YolosConfig"), ("yoso", "YosoConfig"), + ("zamba", "ZambaConfig"), ("zoedepth", "ZoeDepthConfig"), ] ) @@ -311,6 +331,8 @@ ("albert", "ALBERT"), ("align", "ALIGN"), ("altclip", "AltCLIP"), + ("aria", "Aria"), + ("aria_text", "AriaText"), ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"), ("autoformer", "Autoformer"), ("bark", "Bark"), @@ -342,6 +364,7 @@ ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), ("clap", "CLAP"), ("clip", "CLIP"), + ("clip_text_model", "CLIPTextModel"), ("clip_vision_model", "CLIPVisionModel"), ("clipseg", "CLIPSeg"), ("clvp", "CLVP"), @@ -403,6 +426,7 @@ ("gemma", "Gemma"), ("gemma2", "Gemma2"), ("git", "GIT"), + ("glm", "GLM"), ("glpn", "GLPN"), ("gpt-sw3", "GPT-Sw3"), ("gpt2", "OpenAI GPT-2"), @@ -412,6 +436,8 @@ ("gpt_neox_japanese", "GPT NeoX Japanese"), ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), + ("granite", "Granite"), + ("granitemoe", "GraniteMoeMoe"), ("graphormer", "Graphormer"), ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), @@ -421,6 +447,9 @@ ("ibert", 
"I-BERT"), ("idefics", "IDEFICS"), ("idefics2", "Idefics2"), + ("idefics3", "Idefics3"), + ("idefics3_vision", "Idefics3VisionTransformer"), + ("ijepa", "I-JEPA"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), ("instructblip", "InstructBLIP"), @@ -442,6 +471,7 @@ ("llava", "LLaVa"), ("llava_next", "LLaVA-NeXT"), ("llava_next_video", "LLaVa-NeXT-Video"), + ("llava_onevision", "LLaVA-Onevision"), ("longformer", "Longformer"), ("longt5", "LongT5"), ("luke", "LUKE"), @@ -463,8 +493,10 @@ ("megatron-bert", "Megatron-BERT"), ("megatron_gpt2", "Megatron-GPT2"), ("mgp-str", "MGP-STR"), + ("mimi", "Mimi"), ("mistral", "Mistral"), ("mixtral", "Mixtral"), + ("mllama", "Mllama"), ("mluke", "mLUKE"), ("mms", "MMS"), ("mobilebert", "MobileBERT"), @@ -472,6 +504,7 @@ ("mobilenet_v2", "MobileNetV2"), ("mobilevit", "MobileViT"), ("mobilevitv2", "MobileViTV2"), + ("moshi", "Moshi"), ("mpnet", "MPNet"), ("mpt", "MPT"), ("mra", "MRA"), @@ -479,6 +512,7 @@ ("musicgen", "MusicGen"), ("musicgen_melody", "MusicGen Melody"), ("mvp", "MVP"), + ("myt5", "myt5"), ("nat", "NAT"), ("nemotron", "Nemotron"), ("nezha", "Nezha"), @@ -487,6 +521,9 @@ ("nougat", "Nougat"), ("nystromformer", "Nyströmformer"), ("olmo", "OLMo"), + ("olmo2", "OLMo2"), + ("olmoe", "OLMoE"), + ("omdet-turbo", "OmDet-Turbo"), ("oneformer", "OneFormer"), ("open-llama", "OpenLlama"), ("openai-gpt", "OpenAI GPT"), @@ -502,8 +539,10 @@ ("persimmon", "Persimmon"), ("phi", "Phi"), ("phi3", "Phi3"), + ("phimoe", "Phimoe"), ("phobert", "PhoBERT"), ("pix2struct", "Pix2Struct"), + ("pixtral", "Pixtral"), ("plbart", "PLBart"), ("poolformer", "PoolFormer"), ("pop2piano", "Pop2Piano"), @@ -515,6 +554,7 @@ ("qwen2_audio", "Qwen2Audio"), ("qwen2_audio_encoder", "Qwen2AudioEncoder"), ("qwen2_moe", "Qwen2MoE"), + ("qwen2_vl", "Qwen2VL"), ("rag", "RAG"), ("realm", "REALM"), ("recurrent_gemma", "RecurrentGemma"), @@ -608,6 +648,7 @@ ("xmod", "X-MOD"), ("yolos", "YOLOS"), ("yoso", "YOSO"), + ("zamba", "Zamba"), ("zoedepth", "ZoeDepth"), ] ) @@ -653,6 +694,9 @@ ("xclip", "x_clip"), ("clip_vision_model", "clip"), ("qwen2_audio_encoder", "qwen2_audio"), + ("clip_text_model", "clip"), + ("aria_text", "aria"), + ("idefics3_vision", "idefics3"), ("siglip_vision_model", "siglip"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), @@ -1037,4 +1081,4 @@ def register(model_type, config, exist_ok=False): f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they " "match!" 
) - CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok) \ No newline at end of file + CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok) diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 1ef052f7bf3f8e..f7248b0f1e0d62 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -35,6 +35,8 @@ ("albert", "AlbertModel"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), + ("aria", "AriaForConditionalGeneration"), + ("aria_text", "AriaTextModel"), ("audio-spectrogram-transformer", "ASTModel"), ("autoformer", "AutoformerModel"), ("bark", "BarkModel"), @@ -60,6 +62,7 @@ ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), ("clap", "ClapModel"), ("clip", "CLIPModel"), + ("clip_text_model", "CLIPTextModel"), ("clip_vision_model", "CLIPVisionModel"), ("clipseg", "CLIPSegModel"), ("clvp", "ClvpModelForConditionalGeneration"), @@ -111,6 +114,7 @@ ("gemma", "GemmaModel"), ("gemma2", "Gemma2Model"), ("git", "GitModel"), + ("glm", "GlmModel"), ("glpn", "GLPNModel"), ("gpt-sw3", "GPT2Model"), ("gpt2", "GPT2Model"), @@ -120,6 +124,8 @@ ("gpt_neox_japanese", "GPTNeoXJapaneseModel"), ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), + ("granite", "GraniteModel"), + ("granitemoe", "GraniteMoeModel"), ("graphormer", "GraphormerModel"), ("grounding-dino", "GroundingDinoModel"), ("groupvit", "GroupViTModel"), @@ -128,6 +134,9 @@ ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), ("idefics2", "Idefics2Model"), + ("idefics3", "Idefics3Model"), + ("idefics3_vision", "Idefics3VisionTransformer"), + ("ijepa", "IJepaModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), ("jamba", "JambaModel"), @@ -158,6 +167,7 @@ ("mega", "MegaModel"), ("megatron-bert", "MegatronBertModel"), ("mgp-str", "MgpstrForSceneTextRecognition"), + ("mimi", "MimiModel"), ("mistral", "MistralModel"), ("mixtral", "MixtralModel"), ("mobilebert", "MobileBertModel"), @@ -165,6 +175,7 @@ ("mobilenet_v2", "MobileNetV2Model"), ("mobilevit", "MobileViTModel"), ("mobilevitv2", "MobileViTV2Model"), + ("moshi", "MoshiModel"), ("mpnet", "MPNetModel"), ("mpt", "MptModel"), ("mra", "MraModel"), @@ -178,6 +189,9 @@ ("nllb-moe", "NllbMoeModel"), ("nystromformer", "NystromformerModel"), ("olmo", "OlmoModel"), + ("olmo2", "Olmo2Model"), + ("olmoe", "OlmoeModel"), + ("omdet-turbo", "OmDetTurboForObjectDetection"), ("oneformer", "OneFormerModel"), ("open-llama", "OpenLlamaModel"), ("openai-gpt", "OpenAIGPTModel"), @@ -192,6 +206,8 @@ ("persimmon", "PersimmonModel"), ("phi", "PhiModel"), ("phi3", "Phi3Model"), + ("phimoe", "PhimoeModel"), + ("pixtral", "PixtralVisionModel"), ("plbart", "PLBartModel"), ("poolformer", "PoolFormerModel"), ("prophetnet", "ProphetNetModel"), @@ -201,6 +217,7 @@ ("qwen2", "Qwen2Model"), ("qwen2_audio_encoder", "Qwen2AudioEncoder"), ("qwen2_moe", "Qwen2MoeModel"), + ("qwen2_vl", "Qwen2VLModel"), ("recurrent_gemma", "RecurrentGemmaModel"), ("reformer", "ReformerModel"), ("regnet", "RegNetModel"), @@ -275,6 +292,7 @@ ("xmod", "XmodModel"), ("yolos", "YolosModel"), ("yoso", "YosoModel"), + ("zamba", "ZambaModel"), ] ) @@ -308,10 +326,12 @@ ("ibert", "IBertForMaskedLM"), ("idefics", "IdeficsForVisionText2Text"), ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), ("llava_next", 
"LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), ("longformer", "LongformerForMaskedLM"), ("luke", "LukeForMaskedLM"), ("lxmert", "LxmertForPreTraining"), @@ -319,6 +339,7 @@ ("mamba2", "Mamba2ForCausalLM"), ("mega", "MegaForMaskedLM"), ("megatron-bert", "MegatronBertForPreTraining"), + ("mllama", "MllamaForConditionalGeneration"), ("mobilebert", "MobileBertForPreTraining"), ("mpnet", "MPNetForMaskedLM"), ("mpt", "MptForCausalLM"), @@ -447,6 +468,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( [ # Model for Causal LM mapping + ("aria_text", "AriaTextForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), @@ -472,6 +494,7 @@ ("gemma", "GemmaForCausalLM"), ("gemma2", "Gemma2ForCausalLM"), ("git", "GitForCausalLM"), + ("glm", "GlmForCausalLM"), ("gpt-sw3", "GPT2LMHeadModel"), ("gpt2", "GPT2LMHeadModel"), ("gpt_bigcode", "GPTBigCodeForCausalLM"), @@ -479,6 +502,8 @@ ("gpt_neox", "GPTNeoXForCausalLM"), ("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"), ("gptj", "GPTJForCausalLM"), + ("granite", "GraniteForCausalLM"), + ("granitemoe", "GraniteMoeForCausalLM"), ("jamba", "JambaForCausalLM"), ("jetmoe", "JetMoeForCausalLM"), ("llama", "LlamaForCausalLM"), @@ -490,12 +515,16 @@ ("megatron-bert", "MegatronBertForCausalLM"), ("mistral", "MistralForCausalLM"), ("mixtral", "MixtralForCausalLM"), + ("mllama", "MllamaForCausalLM"), + ("moshi", "MoshiForCausalLM"), ("mpt", "MptForCausalLM"), ("musicgen", "MusicgenForCausalLM"), ("musicgen_melody", "MusicgenMelodyForCausalLM"), ("mvp", "MvpForCausalLM"), ("nemotron", "NemotronForCausalLM"), ("olmo", "OlmoForCausalLM"), + ("olmo2", "Olmo2ForCausalLM"), + ("olmoe", "OlmoeForCausalLM"), ("open-llama", "OpenLlamaForCausalLM"), ("openai-gpt", "OpenAIGPTLMHeadModel"), ("opt", "OPTForCausalLM"), @@ -503,6 +532,7 @@ ("persimmon", "PersimmonForCausalLM"), ("phi", "PhiForCausalLM"), ("phi3", "Phi3ForCausalLM"), + ("phimoe", "PhimoeForCausalLM"), ("plbart", "PLBartForCausalLM"), ("prophetnet", "ProphetNetForCausalLM"), ("qdqbert", "QDQBertLMHeadModel"), @@ -529,6 +559,7 @@ ("xlm-roberta-xl", "XLMRobertaXLForCausalLM"), ("xlnet", "XLNetLMHeadModel"), ("xmod", "XmodForCausalLM"), + ("zamba", "ZambaForCausalLM"), ] ) @@ -554,8 +585,10 @@ ("focalnet", "FocalNetModel"), ("glpn", "GLPNModel"), ("hiera", "HieraModel"), + ("ijepa", "IJepaModel"), ("imagegpt", "ImageGPTModel"), ("levit", "LevitModel"), + ("mllama", "MllamaVisionModel"), ("mobilenet_v1", "MobileNetV1Model"), ("mobilenet_v2", "MobileNetV2Model"), ("mobilevit", "MobileViTModel"), @@ -620,7 +653,6 @@ ), ("dinat", "DinatForImageClassification"), ("dinov2", "Dinov2ForImageClassification"), - ("dinov2_with_registers", "Dinov2WithRegistersForImageClassification"), ( "efficientformer", ( @@ -628,9 +660,11 @@ "EfficientFormerForImageClassificationWithTeacher", ), ), + ("dinov2_with_registers", "Dinov2WithRegistersForImageClassification"), ("efficientnet", "EfficientNetForImageClassification"), ("focalnet", "FocalNetForImageClassification"), ("hiera", "HieraForImageClassification"), + ("ijepa", "IJepaForImageClassification"), ("imagegpt", "ImageGPTForImageClassification"), ( "levit", @@ -721,20 +755,51 @@ ("chameleon", "ChameleonForConditionalGeneration"), ("git", "GitForCausalLM"), ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), ("instructblip", 
"InstructBlipForConditionalGeneration"), ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), ("paligemma", "PaliGemmaForConditionalGeneration"), ("pix2struct", "Pix2StructForConditionalGeneration"), + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), ("video_llava", "VideoLlavaForConditionalGeneration"), ("vipllava", "VipLlavaForConditionalGeneration"), ("vision-encoder-decoder", "VisionEncoderDecoderModel"), ] ) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( + [ + ("aria", "AriaForConditionalGeneration"), + ("blip", "BlipForConditionalGeneration"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("chameleon", "ChameleonForConditionalGeneration"), + ("fuyu", "FuyuForCausalLM"), + ("git", "GitForCausalLM"), + ("idefics", "IdeficsForVisionText2Text"), + ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), + ("instructblip", "InstructBlipForConditionalGeneration"), + ("kosmos-2", "Kosmos2ForConditionalGeneration"), + ("llava", "LlavaForConditionalGeneration"), + ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), + ("paligemma", "PaliGemmaForConditionalGeneration"), + ("pix2struct", "Pix2StructForConditionalGeneration"), + ("pixtral", "LlavaForConditionalGeneration"), + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + ("udop", "UdopForConditionalGeneration"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("vision-encoder-decoder", "VisionEncoderDecoderModel"), + ] +) + MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( [ # Model for Masked LM mapping @@ -803,6 +868,7 @@ [ # Model for Zero Shot Object Detection mapping ("grounding-dino", "GroundingDinoForObjectDetection"), + ("omdet-turbo", "OmDetTurboForObjectDetection"), ("owlv2", "Owlv2ForObjectDetection"), ("owlvit", "OwlViTForObjectDetection"), ] @@ -890,6 +956,7 @@ ("funnel", "FunnelForSequenceClassification"), ("gemma", "GemmaForSequenceClassification"), ("gemma2", "Gemma2ForSequenceClassification"), + ("glm", "GlmForSequenceClassification"), ("gpt-sw3", "GPT2ForSequenceClassification"), ("gpt2", "GPT2ForSequenceClassification"), ("gpt_bigcode", "GPTBigCodeForSequenceClassification"), @@ -929,6 +996,7 @@ ("persimmon", "PersimmonForSequenceClassification"), ("phi", "PhiForSequenceClassification"), ("phi3", "Phi3ForSequenceClassification"), + ("phimoe", "PhimoeForSequenceClassification"), ("plbart", "PLBartForSequenceClassification"), ("qdqbert", "QDQBertForSequenceClassification"), ("qwen2", "Qwen2ForSequenceClassification"), @@ -952,6 +1020,7 @@ ("xlnet", "XLNetForSequenceClassification"), ("xmod", "XmodForSequenceClassification"), ("yoso", "YosoForSequenceClassification"), + ("zamba", "ZambaForSequenceClassification"), ] ) @@ -995,6 +1064,8 @@ ("mbart", "MBartForQuestionAnswering"), ("mega", "MegaForQuestionAnswering"), ("megatron-bert", "MegatronBertForQuestionAnswering"), + ("mistral", "MistralForQuestionAnswering"), + ("mixtral", "MixtralForQuestionAnswering"), ("mobilebert", "MobileBertForQuestionAnswering"), ("mpnet", "MPNetForQuestionAnswering"), ("mpt", "MptForQuestionAnswering"), @@ -1006,6 +1077,8 @@ 
("nystromformer", "NystromformerForQuestionAnswering"), ("opt", "OPTForQuestionAnswering"), ("qdqbert", "QDQBertForQuestionAnswering"), + ("qwen2", "Qwen2ForQuestionAnswering"), + ("qwen2_moe", "Qwen2MoeForQuestionAnswering"), ("reformer", "ReformerForQuestionAnswering"), ("rembert", "RemBertForQuestionAnswering"), ("roberta", "RobertaForQuestionAnswering"), @@ -1074,6 +1147,7 @@ ("funnel", "FunnelForTokenClassification"), ("gemma", "GemmaForTokenClassification"), ("gemma2", "Gemma2ForTokenClassification"), + ("glm", "GlmForTokenClassification"), ("gpt-sw3", "GPT2ForTokenClassification"), ("gpt2", "GPT2ForTokenClassification"), ("gpt_bigcode", "GPTBigCodeForTokenClassification"), @@ -1267,6 +1341,7 @@ ("align", "AlignModel"), ("altclip", "AltCLIPModel"), ("blip", "BlipModel"), + ("blip-2", "Blip2ForImageTextRetrieval"), ("chinese_clip", "ChineseCLIPModel"), ("clip", "CLIPModel"), ("clipseg", "CLIPSegModel"), @@ -1317,6 +1392,7 @@ ("albert", "AlbertModel"), ("bert", "BertModel"), ("big_bird", "BigBirdModel"), + ("clip_text_model", "CLIPTextModel"), ("data2vec-text", "Data2VecTextModel"), ("deberta", "DebertaModel"), ("deberta-v2", "DebertaV2Model"), @@ -1325,6 +1401,7 @@ ("flaubert", "FlaubertModel"), ("ibert", "IBertModel"), ("longformer", "LongformerModel"), + ("mllama", "MllamaTextModel"), ("mobilebert", "MobileBertModel"), ("mt5", "MT5EncoderModel"), ("nystromformer", "NystromformerModel"), @@ -1392,6 +1469,9 @@ CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES ) MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES +) MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES ) @@ -1686,6 +1766,13 @@ class AutoModelForVision2Seq(_BaseAutoModelClass): AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling") +class AutoModelForImageTextToText(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + +AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling") + + class AutoModelForAudioClassification(_BaseAutoModelClass): _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING @@ -1763,4 +1850,4 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): "`AutoModelForSeq2SeqLM` for encoder-decoder models.", FutureWarning, ) - return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) \ No newline at end of file + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py index 7ea58d19ea894d..09c3a5ead130d9 100644 --- a/src/transformers/models/dinov2/modeling_dinov2.py +++ b/src/transformers/models/dinov2/modeling_dinov2.py @@ -38,6 +38,7 @@ add_start_docstrings_to_model_forward, logging, replace_return_docstrings, + torch_int, ) from ...utils.backbone_utils import BackboneMixin from .configuration_dinov2 import Dinov2Config @@ -71,42 +72,48 @@ def __init__(self, config: Dinov2Config) -> None: num_patches = self.patch_embeddings.num_patches self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size)) self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.patch_size = 
config.patch_size self.config = config def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: """ - This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher - resolution images. + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing and interpolation at torch.float32 precision. - Source: - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174 + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 """ num_patches = embeddings.shape[1] - 1 num_positions = self.position_embeddings.shape[1] - 1 - if num_patches == num_positions and height == width: + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: return self.position_embeddings - class_pos_embed = self.position_embeddings[:, 0] + + class_pos_embed = self.position_embeddings[:, :1] patch_pos_embed = self.position_embeddings[:, 1:] + dim = embeddings.shape[-1] - height = height // self.config.patch_size - width = width // self.config.patch_size - # we add a small number to avoid floating point error in the interpolation - # see discussion at https://github.com/facebookresearch/dino/issues/8 - height, width = height + 0.1, width + 0.1 - patch_pos_embed = patch_pos_embed.reshape(1, int(math.sqrt(num_positions)), int(math.sqrt(num_positions)), dim) + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) target_dtype = patch_pos_embed.dtype patch_pos_embed = nn.functional.interpolate( - patch_pos_embed.to(dtype=torch.float32), - scale_factor=(float(height / math.sqrt(num_positions)), float(width / math.sqrt(num_positions))), + patch_pos_embed.to(torch.float32), + size=(new_height, new_width), mode="bicubic", align_corners=False, ).to(dtype=target_dtype) - if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]: - raise ValueError("Width or height does not match with the interpolated position embeddings") + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) - return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor: batch_size, _, height, width = pixel_values.shape @@ -224,6 +231,47 @@ def forward( return outputs +class Dinov2SdpaSelfAttention(Dinov2SelfAttention): + def __init__(self, config: Dinov2Config) -> None: + super().__init__(config) + self.attention_probs_dropout_prob = config.attention_probs_dropout_prob + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + if output_attentions: + # TODO: Improve this warning 
with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "Dinov2Model is using Dinov2SdpaSelfAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, head_mask=head_mask, output_attentions=output_attentions + ) + + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + head_mask, + self.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=None, + ) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + return context_layer, None + + # Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->Dinov2 class Dinov2SelfOutput(nn.Module): """ @@ -283,6 +331,13 @@ def forward( return outputs +# Copied from transformers.models.vit.modeling_vit.ViTSdpaAttention with ViT->Dinov2 +class Dinov2SdpaAttention(Dinov2Attention): + def __init__(self, config: Dinov2Config) -> None: + super().__init__(config) + self.attention = Dinov2SdpaSelfAttention(config) + + class Dinov2LayerScale(nn.Module): def __init__(self, config) -> None: super().__init__() @@ -302,7 +357,6 @@ def __init__(self, drop_prob: Optional[float] = None) -> None: def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: """ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). - Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... 
I've opted for changing the @@ -361,6 +415,12 @@ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor: return self.weights_out(hidden) +DINOV2_ATTENTION_CLASSES = { + "eager": Dinov2Attention, + "sdpa": Dinov2SdpaAttention, +} + + class Dinov2Layer(nn.Module): """This corresponds to the Block class in the original implementation.""" @@ -368,7 +428,7 @@ def __init__(self, config: Dinov2Config) -> None: super().__init__() self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.attention = Dinov2Attention(config) + self.attention = DINOV2_ATTENTION_CLASSES[config._attn_implementation](config) self.layer_scale1 = Dinov2LayerScale(config) self.drop_path = Dinov2DropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity() @@ -475,6 +535,7 @@ class Dinov2PreTrainedModel(PreTrainedModel): main_input_name = "pixel_values" supports_gradient_checkpointing = True _no_split_modules = ["Dinov2SwiGLUFFN"] + _supports_sdpa = True def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: """Initialize the weights""" @@ -847,4 +908,4 @@ def forward( feature_maps=feature_maps, hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions if output_attentions else None, - ) \ No newline at end of file + ) diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 34c767cf2b0429..c0adae0756f923 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -72,6 +72,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class OffloadedStaticCache(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class QuantizedCache(metaclass=DummyObject): _backends = ["torch"] @@ -184,6 +191,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class BayesianDetectorConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class BayesianDetectorModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BeamScorer(metaclass=DummyObject): _backends = ["torch"] @@ -289,13 +310,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ForceTokensLogitsProcessor(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class GenerationMixin(metaclass=DummyObject): _backends = ["torch"] @@ -457,6 +471,27 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class SynthIDTextWatermarkDetector(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class SynthIDTextWatermarkingConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class SynthIDTextWatermarkLogitsProcessor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class TemperatureLogitsWarper(metaclass=DummyObject): _backends = ["torch"] @@ -513,6 +548,17 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class TorchExportableModuleWithStaticCache(metaclass=DummyObject): + _backends = ["torch"] + + def 
__init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +def convert_and_export_with_cache(*args, **kwargs): + requires_backends(convert_and_export_with_cache, ["torch"]) + + ROPE_INIT_FUNCTIONS = None @@ -639,6 +685,41 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AriaForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaTextForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaTextPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class ASTForAudioClassification(metaclass=DummyObject): _backends = ["torch"] @@ -696,6 +777,9 @@ def __init__(self, *args, **kwargs): MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = None +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = None + + MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = None @@ -863,6 +947,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AutoModelForImageTextToText(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class AutoModelForImageToImage(metaclass=DummyObject): _backends = ["torch"] @@ -1262,13 +1353,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class BertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class BertLMHeadModel(metaclass=DummyObject): _backends = ["torch"] @@ -1368,13 +1452,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class BigBirdLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class BigBirdModel(metaclass=DummyObject): _backends = ["torch"] @@ -1610,6 +1687,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Blip2ForImageTextRetrieval(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Blip2Model(metaclass=DummyObject): _backends = ["torch"] @@ -1631,6 +1715,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Blip2TextModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Blip2VisionModel(metaclass=DummyObject): _backends = ["torch"] @@ -1638,6 +1729,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Blip2VisionModelWithProjection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class BloomForCausalLM(metaclass=DummyObject): _backends = ["torch"] @@ -1841,13 +1939,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class CanineLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - 
- class CanineModel(metaclass=DummyObject): _backends = ["torch"] @@ -2209,13 +2300,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ConvBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ConvBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -3123,13 +3207,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class QDQBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class QDQBertLMHeadModel(metaclass=DummyObject): _backends = ["torch"] @@ -4140,13 +4217,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class FNetLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class FNetModel(metaclass=DummyObject): _backends = ["torch"] @@ -4396,6 +4466,41 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class GlmForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmForTokenClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GlmPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class GLPNForDepthEstimation(metaclass=DummyObject): _backends = ["torch"] @@ -4579,84 +4684,112 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXLayer(metaclass=DummyObject): +class GPTNeoXModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXModel(metaclass=DummyObject): +class GPTNeoXPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXPreTrainedModel(metaclass=DummyObject): +class GPTNeoXJapaneseForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXJapaneseForCausalLM(metaclass=DummyObject): +class GPTNeoXJapaneseModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXJapaneseLayer(metaclass=DummyObject): +class GPTNeoXJapanesePreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXJapaneseModel(metaclass=DummyObject): +class GPTJForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTNeoXJapanesePreTrainedModel(metaclass=DummyObject): +class GPTJForQuestionAnswering(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTJForCausalLM(metaclass=DummyObject): +class GPTJForSequenceClassification(metaclass=DummyObject): 
_backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTJForQuestionAnswering(metaclass=DummyObject): +class GPTJModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTJForSequenceClassification(metaclass=DummyObject): +class GPTJPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTJModel(metaclass=DummyObject): +class GraniteForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class GPTJPreTrainedModel(metaclass=DummyObject): +class GraniteModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GranitePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GraniteMoeForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GraniteMoeModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GraniteMoePreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): @@ -4880,151 +5013,214 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ImageGPTForCausalImageModeling(metaclass=DummyObject): +class Idefics3ForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ImageGPTForImageClassification(metaclass=DummyObject): +class Idefics3Model(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ImageGPTModel(metaclass=DummyObject): +class Idefics3PreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ImageGPTPreTrainedModel(metaclass=DummyObject): +class Idefics3Processor(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -def load_tf_weights_in_imagegpt(*args, **kwargs): - requires_backends(load_tf_weights_in_imagegpt, ["torch"]) - - -class InformerForPrediction(metaclass=DummyObject): +class Idefics3VisionConfig(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InformerModel(metaclass=DummyObject): +class Idefics3VisionTransformer(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InformerPreTrainedModel(metaclass=DummyObject): +class IJepaForImageClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipForConditionalGeneration(metaclass=DummyObject): +class IJepaModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipPreTrainedModel(metaclass=DummyObject): +class IJepaPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class 
InstructBlipQFormerModel(metaclass=DummyObject): +class ImageGPTForCausalImageModeling(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipVisionModel(metaclass=DummyObject): +class ImageGPTForImageClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject): +class ImageGPTModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipVideoPreTrainedModel(metaclass=DummyObject): +class ImageGPTPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipVideoQFormerModel(metaclass=DummyObject): +def load_tf_weights_in_imagegpt(*args, **kwargs): + requires_backends(load_tf_weights_in_imagegpt, ["torch"]) + + +class InformerForPrediction(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class InstructBlipVideoVisionModel(metaclass=DummyObject): +class InformerModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaForCausalLM(metaclass=DummyObject): +class InformerPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaForSequenceClassification(metaclass=DummyObject): +class InstructBlipForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaModel(metaclass=DummyObject): +class InstructBlipPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JambaPreTrainedModel(metaclass=DummyObject): +class InstructBlipQFormerModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JetMoeForCausalLM(metaclass=DummyObject): +class InstructBlipVisionModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class JetMoeForSequenceClassification(metaclass=DummyObject): +class InstructBlipVideoForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class InstructBlipVideoPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class InstructBlipVideoQFormerModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class InstructBlipVideoVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JambaForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JambaForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JambaModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + 
requires_backends(self, ["torch"]) + + +class JambaPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JetMoeForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class JetMoeForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): @@ -5360,6 +5556,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class LlavaOnevisionForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class LlavaOnevisionPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class LongformerForMaskedLM(metaclass=DummyObject): _backends = ["torch"] @@ -5409,13 +5619,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class LongformerSelfAttention(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class LongT5EncoderModel(metaclass=DummyObject): _backends = ["torch"] @@ -5556,13 +5759,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class LxmertXLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class M2M100ForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] @@ -5647,6 +5843,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MarianPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MarkupLMForQuestionAnswering(metaclass=DummyObject): _backends = ["torch"] @@ -5864,6 +6067,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MimiModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MimiPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MistralForCausalLM(metaclass=DummyObject): _backends = ["torch"] @@ -5871,6 +6088,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MistralForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MistralForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -5906,6 +6130,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MixtralForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MixtralForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -5934,6 +6165,48 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class MllamaForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MllamaForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + 
+class MllamaPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MllamaProcessor(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MllamaTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MllamaVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class MobileBertForMaskedLM(metaclass=DummyObject): _backends = ["torch"] @@ -5983,13 +6256,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MobileBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class MobileBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -6121,42 +6387,63 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetForMaskedLM(metaclass=DummyObject): +class MoshiForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetForMultipleChoice(metaclass=DummyObject): +class MoshiForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetForQuestionAnswering(metaclass=DummyObject): +class MoshiModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetForSequenceClassification(metaclass=DummyObject): +class MoshiPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetForTokenClassification(metaclass=DummyObject): +class MPNetForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MPNetForMultipleChoice(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class MPNetLayer(metaclass=DummyObject): +class MPNetForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MPNetForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class MPNetForTokenClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): @@ -6534,42 +6821,91 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class NystromformerLayer(metaclass=DummyObject): +class NystromformerModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class NystromformerModel(metaclass=DummyObject): +class NystromformerPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class NystromformerPreTrainedModel(metaclass=DummyObject): +class OlmoForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class OlmoForCausalLM(metaclass=DummyObject): +class OlmoModel(metaclass=DummyObject): _backends = 
["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class OlmoModel(metaclass=DummyObject): +class OlmoPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class OlmoPreTrainedModel(metaclass=DummyObject): +class Olmo2ForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Olmo2Model(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Olmo2PreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class OlmoeForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class OlmoeModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class OlmoePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class OmDetTurboForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class OmDetTurboPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): @@ -6944,13 +7280,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class PerceiverLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class PerceiverModel(metaclass=DummyObject): _backends = ["torch"] @@ -7070,6 +7399,34 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class PhimoeForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PhimoeForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PhimoeModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PhimoePreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Pix2StructForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] @@ -7098,6 +7455,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class PixtralPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class PixtralVisionModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class PLBartForCausalLM(metaclass=DummyObject): _backends = ["torch"] @@ -7266,6 +7637,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Qwen2ForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Qwen2ForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -7322,6 +7700,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, 
["torch"]) +class Qwen2MoeForQuestionAnswering(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class Qwen2MoeForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] @@ -7350,84 +7735,91 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RagModel(metaclass=DummyObject): +class Qwen2VLForConditionalGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RagPreTrainedModel(metaclass=DummyObject): +class Qwen2VLModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RagSequenceForGeneration(metaclass=DummyObject): +class Qwen2VLPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RagTokenForGeneration(metaclass=DummyObject): +class RagModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RecurrentGemmaForCausalLM(metaclass=DummyObject): +class RagPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RecurrentGemmaModel(metaclass=DummyObject): +class RagSequenceForGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RecurrentGemmaPreTrainedModel(metaclass=DummyObject): +class RagTokenForGeneration(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerAttention(metaclass=DummyObject): +class RecurrentGemmaForCausalLM(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerForMaskedLM(metaclass=DummyObject): +class RecurrentGemmaModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerForQuestionAnswering(metaclass=DummyObject): +class RecurrentGemmaPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerForSequenceClassification(metaclass=DummyObject): +class ReformerForMaskedLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ReformerForQuestionAnswering(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ReformerLayer(metaclass=DummyObject): +class ReformerForSequenceClassification(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): @@ -7518,13 +7910,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RemBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class RemBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -7732,13 +8117,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RoCBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class RoCBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -7799,13 +8177,6 @@ def 
__init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class RoFormerLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class RoFormerModel(metaclass=DummyObject): _backends = ["torch"] @@ -8027,13 +8398,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class SegformerLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class SegformerModel(metaclass=DummyObject): _backends = ["torch"] @@ -8244,13 +8608,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class SplinterLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class SplinterModel(metaclass=DummyObject): _backends = ["torch"] @@ -8307,13 +8664,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class SqueezeBertModule(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class SqueezeBertPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] @@ -9022,13 +9372,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViltLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ViltModel(metaclass=DummyObject): _backends = ["torch"] @@ -9106,13 +9449,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class VisualBertLayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class VisualBertModel(metaclass=DummyObject): _backends = ["torch"] @@ -9162,13 +9498,6 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class ViTMAELayer(metaclass=DummyObject): - _backends = ["torch"] - - def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) - - class ViTMAEModel(metaclass=DummyObject): _backends = ["torch"] @@ -9887,21 +10216,42 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class YosoLayer(metaclass=DummyObject): +class YosoModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class YosoModel(metaclass=DummyObject): +class YosoPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) -class YosoPreTrainedModel(metaclass=DummyObject): +class ZambaForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZambaForSequenceClassification(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZambaModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class ZambaPreTrainedModel(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): @@ -10002,4 +10352,4 @@ class Seq2SeqTrainer(metaclass=DummyObject): _backends = ["torch"] def __init__(self, *args, **kwargs): - requires_backends(self, ["torch"]) \ No newline at end of file + requires_backends(self, ["torch"]) diff --git a/utils/check_repo.py b/utils/check_repo.py index 
cafa2e42bd3563..130eebf0b83801 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -31,7 +31,6 @@ It has no auto-fix mode. """ -import inspect import os import re import sys @@ -70,6 +69,7 @@ "UMT5Stack", "Pop2PianoStack", "Qwen2AudioEncoder", + "Qwen2VisionTransformerPretrainedModel", "SwitchTransformersStack", "TFDPRSpanPredictor", "MaskFormerSwinModel", @@ -82,54 +82,65 @@ "SeamlessM4Tv2TextToUnitModel", "SeamlessM4Tv2CodeHifiGan", "SeamlessM4Tv2TextToUnitForConditionalGeneration", + "Idefics2PerceiverResampler", + "Idefics2VisionTransformer", + "Idefics3VisionTransformer", + "AriaTextForCausalLM", + "AriaTextModel", ] # Update this list for models that are not tested with a comment explaining the reason it should not be. # Being in this list is an exception and should **not** be the rule. -IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ - # models to ignore for not tested - "RecurrentGemmaModel", # Building part of bigger (tested) model. - "FuyuForCausalLM", # Not tested fort now - "InstructBlipQFormerModel", # Building part of bigger (tested) model. - "InstructBlipVideoQFormerModel", # Building part of bigger (tested) model. - "UMT5EncoderModel", # Building part of bigger (tested) model. - "Blip2QFormerModel", # Building part of bigger (tested) model. - "ErnieMForInformationExtraction", - "FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from) - "FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models. - "GraphormerDecoderHead", # Building part of bigger (tested) model. - "JukeboxVQVAE", # Building part of bigger (tested) model. - "JukeboxPrior", # Building part of bigger (tested) model. - "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. - "SegformerDecodeHead", # Building part of bigger (tested) model. - "MgpstrModel", # Building part of bigger (tested) model. - "BertLMHeadModel", # Needs to be setup as decoder. - "MegatronBertLMHeadModel", # Building part of bigger (tested) model. - "RealmBertModel", # Building part of bigger (tested) model. - "RealmReader", # Not regular model. - "RealmScorer", # Not regular model. - "RealmForOpenQA", # Not regular model. - "ReformerForMaskedLM", # Needs to be setup as decoder. - "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) - "TFRobertaForMultipleChoice", # TODO: fix - "TFRobertaPreLayerNormForMultipleChoice", # TODO: fix - "SeparableConv1D", # Building part of bigger (tested) model. - "FlaxBartForCausalLM", # Building part of bigger (tested) model. - "FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM. - "OPTDecoderWrapper", - "TFSegformerDecodeHead", # Not a regular model. - "AltRobertaModel", # Building part of bigger (tested) model. - "BlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models - "TFBlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models - "BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model. - "BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model. - "BarkCausalModel", # Building part of bigger (tested) model. - "BarkModel", # Does not have a forward signature - generation tested with integration tests. - "SeamlessM4TTextToUnitModel", # Building part of bigger (tested) model. - "SeamlessM4TCodeHifiGan", # Building part of bigger (tested) model. 
- "SeamlessM4TTextToUnitForConditionalGeneration", # Building part of bigger (tested) model. - "ChameleonVQVAE", # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model -] +IGNORE_NON_TESTED = ( + PRIVATE_MODELS.copy() + + [ + # models to ignore for not tested + "RecurrentGemmaModel", # Building part of bigger (tested) model. + "FuyuForCausalLM", # Not tested fort now + "InstructBlipQFormerModel", # Building part of bigger (tested) model. + "InstructBlipVideoQFormerModel", # Building part of bigger (tested) model. + "UMT5EncoderModel", # Building part of bigger (tested) model. + "Blip2QFormerModel", # Building part of bigger (tested) model. + "ErnieMForInformationExtraction", + "FastSpeech2ConformerHifiGan", # Already tested by SpeechT5HifiGan (# Copied from) + "FastSpeech2ConformerWithHifiGan", # Built with two smaller (tested) models. + "GraphormerDecoderHead", # Building part of bigger (tested) model. + "JukeboxVQVAE", # Building part of bigger (tested) model. + "JukeboxPrior", # Building part of bigger (tested) model. + "DecisionTransformerGPT2Model", # Building part of bigger (tested) model. + "SegformerDecodeHead", # Building part of bigger (tested) model. + "MgpstrModel", # Building part of bigger (tested) model. + "BertLMHeadModel", # Needs to be setup as decoder. + "MegatronBertLMHeadModel", # Building part of bigger (tested) model. + "RealmBertModel", # Building part of bigger (tested) model. + "RealmReader", # Not regular model. + "RealmScorer", # Not regular model. + "RealmForOpenQA", # Not regular model. + "ReformerForMaskedLM", # Needs to be setup as decoder. + "TFElectraMainLayer", # Building part of bigger (tested) model (should it be a TFPreTrainedModel ?) + "TFRobertaForMultipleChoice", # TODO: fix + "TFRobertaPreLayerNormForMultipleChoice", # TODO: fix + "SeparableConv1D", # Building part of bigger (tested) model. + "FlaxBartForCausalLM", # Building part of bigger (tested) model. + "FlaxBertForCausalLM", # Building part of bigger (tested) model. Tested implicitly through FlaxRobertaForCausalLM. + "OPTDecoderWrapper", + "TFSegformerDecodeHead", # Not a regular model. + "AltRobertaModel", # Building part of bigger (tested) model. + "BlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models + "TFBlipTextLMHeadModel", # No need to test it as it is tested by BlipTextVision models + "BridgeTowerTextModel", # No need to test it as it is tested by BridgeTowerModel model. + "BridgeTowerVisionModel", # No need to test it as it is tested by BridgeTowerModel model. + "BarkCausalModel", # Building part of bigger (tested) model. + "BarkModel", # Does not have a forward signature - generation tested with integration tests. + "SeamlessM4TTextToUnitModel", # Building part of bigger (tested) model. + "SeamlessM4TCodeHifiGan", # Building part of bigger (tested) model. + "SeamlessM4TTextToUnitForConditionalGeneration", # Building part of bigger (tested) model. + "ChameleonVQVAE", # VQVAE here is used only for encoding (discretizing) and is tested as part of bigger model + "Qwen2VLModel", # Building part of bigger (tested) model. Tested implicitly through Qwen2VLForConditionalGeneration. + "MllamaTextModel", # Building part of bigger (tested) model. # TODO: add tests + "MllamaVisionModel", # Building part of bigger (tested) model. # TODO: add tests + ] +) # Update this list with test files that don't have a tester with a `all_model_classes` variable and which don't # trigger the common tests. 
@@ -163,7 +174,8 @@ "ClapTextModelWithProjection", "ClapAudioModel", "ClapAudioModelWithProjection", - "Blip2ForConditionalGeneration", + "Blip2TextModelWithProjection", + "Blip2VisionModelWithProjection", "Blip2QFormerModel", "Blip2VisionModel", "ErnieMForInformationExtraction", @@ -172,7 +184,6 @@ "GitVisionModel", "GraphormerModel", "GraphormerForGraphClassification", - "BlipForConditionalGeneration", "BlipForImageTextRetrieval", "BlipForQuestionAnswering", "BlipVisionModel", @@ -218,7 +229,6 @@ "BeitForMaskedImageModeling", "ChineseCLIPTextModel", "ChineseCLIPVisionModel", - "CLIPTextModel", "CLIPTextModelWithProjection", "CLIPVisionModelWithProjection", "ClvpForCausalLM", @@ -236,7 +246,6 @@ "DetrForSegmentation", "Pix2StructVisionModel", "Pix2StructTextModel", - "Pix2StructForConditionalGeneration", "ConditionalDetrForSegmentation", "DPRReader", "FlaubertForQuestionAnswering", @@ -313,7 +322,6 @@ "SeamlessM4TCodeHifiGan", "SeamlessM4TForSpeechToSpeech", # no auto class for speech-to-speech "TvpForVideoGrounding", - "UdopForConditionalGeneration", "SeamlessM4Tv2NARTextToUnitModel", "SeamlessM4Tv2NARTextToUnitForConditionalGeneration", "SeamlessM4Tv2CodeHifiGan", @@ -322,6 +330,8 @@ "SiglipVisionModel", "SiglipTextModel", "ChameleonVQVAE", # no autoclass for VQ-VAE models + "CLIPTextModel", + "MoshiForConditionalGeneration", # no auto class for speech-to-speech ] # DO NOT edit this list! @@ -414,22 +424,15 @@ def get_model_modules() -> List[str]: "modeling_auto", "modeling_encoder_decoder", "modeling_marian", - "modeling_mmbt", - "modeling_outputs", "modeling_retribert", - "modeling_utils", "modeling_flax_auto", "modeling_flax_encoder_decoder", - "modeling_flax_utils", "modeling_speech_encoder_decoder", "modeling_flax_speech_encoder_decoder", "modeling_flax_vision_encoder_decoder", "modeling_timm_backbone", "modeling_tf_auto", "modeling_tf_encoder_decoder", - "modeling_tf_outputs", - "modeling_tf_pytorch_utils", - "modeling_tf_utils", "modeling_tf_vision_encoder_decoder", "modeling_vision_encoder_decoder", ] @@ -443,8 +446,7 @@ def get_model_modules() -> List[str]: for submodule in dir(model_module): if submodule.startswith("modeling") and submodule not in _ignore_modules: modeling_module = getattr(model_module, submodule) - if inspect.ismodule(modeling_module): - modules.append(modeling_module) + modules.append(modeling_module) return modules @@ -906,19 +908,26 @@ def find_all_documented_objects() -> List[str]: Returns: `List[str]`: The list of all object names being documented. + `Dict[str, List[str]]`: A dictionary mapping the object name (full import path, e.g. 
+        `integrations.PeftAdapterMixin`) to its documented methods
     """
     documented_obj = []
-    for doc_file in Path(PATH_TO_DOC).glob("**/*.rst"):
-        with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
-            content = f.read()
-        raw_doc_objs = re.findall(r"(?:autoclass|autofunction):: transformers.(\S+)\s+", content)
-        documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
+    documented_methods_map = {}
     for doc_file in Path(PATH_TO_DOC).glob("**/*.md"):
         with open(doc_file, "r", encoding="utf-8", newline="\n") as f:
             content = f.read()
         raw_doc_objs = re.findall(r"\[\[autodoc\]\]\s+(\S+)\s+", content)
         documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]
-    return documented_obj
+
+        for obj in raw_doc_objs:
+            obj_public_methods = re.findall(rf"\[\[autodoc\]\] {obj}((\n\s+-.*)+)", content)
+            # Some objects have no methods documented
+            if len(obj_public_methods) == 0:
+                continue
+            else:
+                documented_methods_map[obj] = re.findall(r"(?<=-\s).*", obj_public_methods[0][0])
+
+    return documented_obj, documented_methods_map


 # One good reason for not being documented is to be deprecated. Put in this list deprecated objects.
@@ -1057,7 +1066,7 @@ def ignore_undocumented(name: str) -> bool:

 def check_all_objects_are_documented():
     """Check all models are properly documented."""
-    documented_objs = find_all_documented_objects()
+    documented_objs, documented_methods_map = find_all_documented_objects()
     modules = transformers._modules
     objects = [c for c in dir(transformers) if c not in modules and not c.startswith("_")]
     undocumented_objs = [c for c in objects if c not in documented_objs and not ignore_undocumented(c)]
@@ -1066,8 +1075,57 @@ def check_all_objects_are_documented():
             "The following objects are in the public init so should be documented:\n - "
             + "\n - ".join(undocumented_objs)
         )
-    check_docstrings_are_in_md()
     check_model_type_doc_match()
+    check_public_method_exists(documented_methods_map)
+
+
+def check_public_method_exists(documented_methods_map):
+    """Check that all explicitly documented public methods are defined in the corresponding class."""
+    failures = []
+    for obj, methods in documented_methods_map.items():
+        # Let's ensure there is no repetition
+        if len(set(methods)) != len(methods):
+            failures.append(f"Error in the documentation of {obj}: there are repeated documented methods.")
+
+        # Navigates into the object, given the full import path
+        nested_path = obj.split(".")
+        submodule = transformers
+        if len(nested_path) > 1:
+            nested_submodules = nested_path[:-1]
+            for submodule_name in nested_submodules:
+                if submodule_name == "transformers":
+                    continue
+
+                try:
+                    submodule = getattr(submodule, submodule_name)
+                except AttributeError:
+                    failures.append(f"Could not parse {submodule_name}. Are the required dependencies installed?")
+                    continue
+
+        class_name = nested_path[-1]
+
+        try:
+            obj_class = getattr(submodule, class_name)
+        except AttributeError:
+            failures.append(f"Could not parse {submodule_name}. Are the required dependencies installed?")
+            continue
+
+        # Checks that all explicitly documented methods are defined in the class
+        for method in methods:
+            if method == "all":  # Special keyword to document all public methods
+                continue
+            try:
+                if not hasattr(obj_class, method):
+                    failures.append(
+                        "The following public method is explicitly documented but not defined in the corresponding "
+                        f"class. class: {obj}, method: {method}. If the method is defined, this error can be due to "
+                        f"lacking dependencies."
+                    )
+            except ImportError:
+                pass
+
+    if len(failures) > 0:
+        raise Exception("\n".join(failures))


 def check_model_type_doc_match():
@@ -1097,50 +1155,6 @@ def check_model_type_doc_match():
         )


-# Re pattern to catch :obj:`xx`, :class:`xx`, :func:`xx` or :meth:`xx`.
-_re_rst_special_words = re.compile(r":(?:obj|func|class|meth):`([^`]+)`")
-# Re pattern to catch things between double backquotes.
-_re_double_backquotes = re.compile(r"(^|[^`])``([^`]+)``([^`]|$)")
-# Re pattern to catch example introduction.
-_re_rst_example = re.compile(r"^\s*Example.*::\s*$", flags=re.MULTILINE)
-
-
-def is_rst_docstring(docstring: str) -> True:
-    """
-    Returns `True` if `docstring` is written in rst.
-    """
-    if _re_rst_special_words.search(docstring) is not None:
-        return True
-    if _re_double_backquotes.search(docstring) is not None:
-        return True
-    if _re_rst_example.search(docstring) is not None:
-        return True
-    return False
-
-
-def check_docstrings_are_in_md():
-    """Check all docstrings are written in md and nor rst."""
-    files_with_rst = []
-    for file in Path(PATH_TO_TRANSFORMERS).glob("**/*.py"):
-        with open(file, encoding="utf-8") as f:
-            code = f.read()
-        docstrings = code.split('"""')
-
-        for idx, docstring in enumerate(docstrings):
-            if idx % 2 == 0 or not is_rst_docstring(docstring):
-                continue
-            files_with_rst.append(file)
-            break
-
-    if len(files_with_rst) > 0:
-        raise ValueError(
-            "The following files have docstrings written in rst:\n"
-            + "\n".join([f"- {f}" for f in files_with_rst])
-            + "\nTo fix this run `doc-builder convert path_to_py_file` after installing `doc-builder`\n"
-            "(`pip install git+https://github.com/huggingface/doc-builder`)"
-        )
-
-
 def check_deprecated_constant_is_up_to_date():
     """
     Check if the constant `DEPRECATED_MODELS` in `models/auto/configuration_auto.py` is up to date.
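# --- Illustration (not part of the patch): a minimal, standalone sketch of how the new
# `documented_methods_map` is built from `[[autodoc]]` blocks, using the same regexes the
# diff adds to `find_all_documented_objects`. The sample doc snippet and the printed values
# are assumptions for demonstration only; the real check walks every *.md file under PATH_TO_DOC.
import re

sample_doc = """
[[autodoc]] integrations.PeftAdapterMixin
    - load_adapter
    - add_adapter
    - set_adapter

[[autodoc]] AutoTokenizer
"""

documented_obj = []
documented_methods_map = {}

raw_doc_objs = re.findall(r"\[\[autodoc\]\]\s+(\S+)\s+", sample_doc)
# Only the bare class/function name goes into the documented-objects list...
documented_obj += [obj.split(".")[-1] for obj in raw_doc_objs]

# ...while the methods map keeps the full import path so the class can be resolved later.
for obj in raw_doc_objs:
    obj_public_methods = re.findall(rf"\[\[autodoc\]\] {obj}((\n\s+-.*)+)", sample_doc)
    if obj_public_methods:  # objects with no "- method" bullets are simply skipped
        documented_methods_map[obj] = re.findall(r"(?<=-\s).*", obj_public_methods[0][0])

print(documented_obj)          # ['PeftAdapterMixin', 'AutoTokenizer']
print(documented_methods_map)  # {'integrations.PeftAdapterMixin': ['load_adapter', 'add_adapter', 'set_adapter']}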
@@ -1171,29 +1185,30 @@ def check_deprecated_constant_is_up_to_date():


 def check_repo_quality():
-    """Check all models are properly tested and documented."""
-    print("Checking all models are included.")
+    """Check all models are tested and documented."""
+    print("Repository-wide checks:")
+    print(" - checking all models are included.")
     check_model_list()
-    print("Checking all models are public.")
+    print(" - checking all models are public.")
     check_models_are_in_init()
-    print("Checking all models are properly tested.")
+    print(" - checking all models have tests.")
     check_all_decorator_order()
     check_all_models_are_tested()
-    print("Checking all objects are properly documented.")
+    print(" - checking all objects have documentation.")
     check_all_objects_are_documented()
-    print("Checking all models are in at least one auto class.")
+    print(" - checking all models are in at least one auto class.")
     check_all_models_are_auto_configured()
-    print("Checking all names in auto name mappings are defined.")
+    print(" - checking all names in auto name mappings are defined.")
     check_all_auto_object_names_being_defined()
-    print("Checking all keys in auto name mappings are defined in `CONFIG_MAPPING_NAMES`.")
+    print(" - checking all keys in auto name mappings are defined in `CONFIG_MAPPING_NAMES`.")
     check_all_auto_mapping_names_in_config_mapping_names()
-    print("Checking all auto mappings could be imported.")
+    print(" - checking all auto mappings could be imported.")
     check_all_auto_mappings_importable()
-    print("Checking all objects are equally (across frameworks) in the main __init__.")
+    print(" - checking all objects are equally (across frameworks) in the main __init__.")
     check_objects_being_equally_in_main_init()
-    print("Checking the DEPRECATED_MODELS constant is up to date.")
+    print(" - checking the DEPRECATED_MODELS constant is up to date.")
     check_deprecated_constant_is_up_to_date()


 if __name__ == "__main__":
-    check_repo_quality()
\ No newline at end of file
+    check_repo_quality()
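# --- Illustration (not part of the patch): a rough sketch of the lookup performed by the new
# `check_public_method_exists` for each entry of the methods map built above. Assumes the
# `transformers` package is importable; "not_a_method" is a made-up name used only to show how
# a missing documented method would surface as a failure.
from typing import List

import transformers


def missing_documented_methods(obj_path: str, methods: List[str]) -> List[str]:
    # Walk the dotted path (e.g. "integrations.PeftAdapterMixin") with getattr.
    submodule = transformers
    *parents, class_name = obj_path.split(".")
    for name in parents:
        if name == "transformers":
            continue
        submodule = getattr(submodule, name)
    obj_class = getattr(submodule, class_name)
    # "all" is the special keyword meaning "document every public method".
    return [m for m in methods if m != "all" and not hasattr(obj_class, m)]


print(missing_documented_methods("integrations.PeftAdapterMixin", ["load_adapter", "all", "not_a_method"]))
# -> ['not_a_method'], which the real check would report as a documentation failure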