From 66ab300aaff9ef509f8736cf186ab9b6a0ef4f3b Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 5 Dec 2024 19:12:22 +0100 Subject: [PATCH 01/62] Dev version --- examples/flax/question-answering/run_qa.py | 2 +- .../speech-recognition/run_flax_speech_recognition_seq2seq.py | 2 +- examples/flax/text-classification/run_flax_glue.py | 2 +- examples/flax/token-classification/run_flax_ner.py | 2 +- .../pytorch/audio-classification/run_audio_classification.py | 2 +- examples/pytorch/contrastive-image-text/run_clip.py | 2 +- .../pytorch/image-classification/run_image_classification.py | 2 +- .../image-classification/run_image_classification_no_trainer.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- examples/pytorch/image-pretraining/run_mim_no_trainer.py | 2 +- .../pytorch/instance-segmentation/run_instance_segmentation.py | 2 +- .../run_instance_segmentation_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- examples/pytorch/multiple-choice/run_swag.py | 2 +- examples/pytorch/multiple-choice/run_swag_no_trainer.py | 2 +- examples/pytorch/object-detection/run_object_detection.py | 2 +- .../pytorch/object-detection/run_object_detection_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa.py | 2 +- examples/pytorch/question-answering/run_qa_beam_search.py | 2 +- .../pytorch/question-answering/run_qa_beam_search_no_trainer.py | 2 +- examples/pytorch/question-answering/run_qa_no_trainer.py | 2 +- examples/pytorch/question-answering/run_seq2seq_qa.py | 2 +- .../pytorch/semantic-segmentation/run_semantic_segmentation.py | 2 +- .../run_semantic_segmentation_no_trainer.py | 2 +- .../pytorch/speech-recognition/run_speech_recognition_ctc.py | 2 +- .../speech-recognition/run_speech_recognition_ctc_adapter.py | 2 +- .../speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/pytorch/summarization/run_summarization.py | 2 +- examples/pytorch/summarization/run_summarization_no_trainer.py | 2 +- examples/pytorch/text-classification/run_classification.py | 2 +- examples/pytorch/text-classification/run_glue.py | 2 +- examples/pytorch/text-classification/run_glue_no_trainer.py | 2 +- examples/pytorch/text-classification/run_xnli.py | 2 +- examples/pytorch/token-classification/run_ner.py | 2 +- examples/pytorch/token-classification/run_ner_no_trainer.py | 2 +- examples/pytorch/translation/run_translation.py | 2 +- examples/pytorch/translation/run_translation_no_trainer.py | 2 +- examples/tensorflow/contrastive-image-text/run_clip.py | 2 +- .../tensorflow/image-classification/run_image_classification.py | 2 +- examples/tensorflow/multiple-choice/run_swag.py | 2 +- examples/tensorflow/question-answering/run_qa.py | 2 +- examples/tensorflow/summarization/run_summarization.py | 2 +- examples/tensorflow/text-classification/run_glue.py | 2 +- examples/tensorflow/translation/run_translation.py | 2 +- setup.py | 2 +- src/transformers/__init__.py | 2 +- 53 files changed, 53 insertions(+), 53 deletions(-) diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py index 25a8706d869bcf..ee155e377e4137 100644 --- 
a/examples/flax/question-answering/run_qa.py +++ b/examples/flax/question-answering/run_qa.py @@ -61,7 +61,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py index c0085c9f4bb88c..095af99efffccc 100644 --- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py +++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risk. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt") diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py index 9ffbb82cd3aae6..ddbde78f703ce2 100755 --- a/examples/flax/text-classification/run_flax_glue.py +++ b/examples/flax/text-classification/run_flax_glue.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") Array = Any Dataset = datasets.arrow_dataset.Dataset diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py index 9ffaade205611c..5f1988c36de17c 100644 --- a/examples/flax/token-classification/run_flax_ner.py +++ b/examples/flax/token-classification/run_flax_ner.py @@ -57,7 +57,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/audio-classification/run_audio_classification.py b/examples/pytorch/audio-classification/run_audio_classification.py index cfbc4d83d93c49..ef308316569b79 100644 --- a/examples/pytorch/audio-classification/run_audio_classification.py +++ b/examples/pytorch/audio-classification/run_audio_classification.py @@ -45,7 +45,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 8353333ef827ed..d42fb52d5c13c0 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -54,7 +54,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index aa1cd089ef5c7f..41b843aade9c66 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -56,7 +56,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/pytorch/image-classification/run_image_classification_no_trainer.py b/examples/pytorch/image-classification/run_image_classification_no_trainer.py index a7193e23dddc39..6cbcac0a7e688a 100644 --- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py +++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 90b30c60e78411..f23e55191709a5 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -43,7 +43,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 773038f445cc40..9d052076b7b162 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/image-pretraining/run_mim_no_trainer.py b/examples/pytorch/image-pretraining/run_mim_no_trainer.py index 5f38481db2315c..100a1365c2e9ea 100644 --- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py +++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py @@ -53,7 +53,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation.py b/examples/pytorch/instance-segmentation/run_instance_segmentation.py index 368296709f699a..806330fb72d107 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py @@ -46,7 +46,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py index d8bb9d6f235e61..d888b7853dd475 100644 --- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py +++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py @@ -52,7 +52,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index d3f8ad8da97f3c..10bfee8f25f775 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -55,7 +55,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index 15538b2ef2e302..078b0add065ceb 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index 9d0e0008839d99..cac845f3a055df 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -58,7 +58,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 0af6d61107db66..0a0e10511fa236 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -60,7 +60,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 4b615fdc4cf1d5..8cb30099491a3a 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -54,7 +54,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 13a1f7a0d86231..0bff38707d567a 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 1c2b7ecf9905ef..20763558a5f626 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py index ea6c4a0e317eec..f188e4e476a208 100755 --- a/examples/pytorch/multiple-choice/run_swag.py +++ b/examples/pytorch/multiple-choice/run_swag.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py index 2f6723907957f5..2d4e8bdbb92c06 100755 --- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py +++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) # You should update this to your particular problem to have better documentation of `model_type` diff --git a/examples/pytorch/object-detection/run_object_detection.py b/examples/pytorch/object-detection/run_object_detection.py index 9111874438648a..07fcb36acb1583 100644 --- a/examples/pytorch/object-detection/run_object_detection.py +++ b/examples/pytorch/object-detection/run_object_detection.py @@ -48,7 +48,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt") diff --git a/examples/pytorch/object-detection/run_object_detection_no_trainer.py b/examples/pytorch/object-detection/run_object_detection_no_trainer.py index f312d0ce8a1f1a..33ad0499301e18 100644 --- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py +++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py @@ -51,7 +51,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logging.basicConfig(level=logging.INFO) logger = get_logger(__name__) diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py index 3159a79c7e5525..6de464f4367072 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py index 2fc71e0666be23..c3e12ac9edef16 100755 --- a/examples/pytorch/question-answering/run_qa_beam_search.py +++ b/examples/pytorch/question-answering/run_qa_beam_search.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py index 3b7d607933c38a..8e791564b007bf 100644 --- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py index a8f1fc10b9c540..6ccce481b548a9 100755 --- a/examples/pytorch/question-answering/run_qa_no_trainer.py +++ b/examples/pytorch/question-answering/run_qa_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py index b0bcb940e5186f..c1874b3fe18e12 100644 --- a/examples/pytorch/question-answering/run_seq2seq_qa.py +++ b/examples/pytorch/question-answering/run_seq2seq_qa.py @@ -46,7 +46,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 46f2fa45a246cc..6f77f82564172d 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt") diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index a0ce4d0f75c639..16e64eb92343f1 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py index 78c798fd471309..9eb3498c8c174e 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py index 4d9bb7780420d8..682d3b16d216a3 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py @@ -53,7 +53,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py index aa03dacd981d6e..e6a643e4213910 100755 --- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py index 9c4c2ac13d44c2..a2d09f20004704 100755 --- a/examples/pytorch/summarization/run_summarization.py +++ b/examples/pytorch/summarization/run_summarization.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py index 3d38e35aac59e2..ab5ab7adb19c44 100644 --- a/examples/pytorch/summarization/run_summarization_no_trainer.py +++ b/examples/pytorch/summarization/run_summarization_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index e7a186836fb82c..dae845b119b172 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py index 90acf81a36a408..2a99bc42e1195a 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 7fcdf81fa861ed..8da7e86d87551f 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) diff --git a/examples/pytorch/text-classification/run_xnli.py b/examples/pytorch/text-classification/run_xnli.py index b058b6f74fdc5c..c76f83ce4def6f 100755 --- a/examples/pytorch/text-classification/run_xnli.py +++ b/examples/pytorch/text-classification/run_xnli.py @@ -48,7 +48,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py index c8cb098e344b0a..056db7167280ed 100755 --- a/examples/pytorch/token-classification/run_ner.py +++ b/examples/pytorch/token-classification/run_ner.py @@ -49,7 +49,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py index 0646af80bdc71d..56e3a1e646db34 100755 --- a/examples/pytorch/token-classification/run_ner_no_trainer.py +++ b/examples/pytorch/token-classification/run_ner_no_trainer.py @@ -56,7 +56,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt") diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py index ea37b9c51e6769..dadfcb80941e27 100755 --- a/examples/pytorch/translation/run_translation.py +++ b/examples/pytorch/translation/run_translation.py @@ -52,7 +52,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py index ba1f15dd83edd4..df4c1e9557a982 100644 --- a/examples/pytorch/translation/run_translation_no_trainer.py +++ b/examples/pytorch/translation/run_translation_no_trainer.py @@ -57,7 +57,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = get_logger(__name__) require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py index 20a01a46f21a59..01c31de8730b12 100644 --- a/examples/tensorflow/contrastive-image-text/run_clip.py +++ b/examples/tensorflow/contrastive-image-text/run_clip.py @@ -51,7 +51,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version( "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt" diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 78655e7d6bc3d5..296c70549bda1b 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -55,7 +55,7 @@ logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py index cbd4400580d9d4..b35d761d8a6a6a 100644 --- a/examples/tensorflow/multiple-choice/run_swag.py +++ b/examples/tensorflow/multiple-choice/run_swag.py @@ -50,7 +50,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py index f9c6de0e42bec2..a78a5d89e19f6c 100755 --- a/examples/tensorflow/question-answering/run_qa.py +++ b/examples/tensorflow/question-answering/run_qa.py @@ -62,7 +62,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py index 92ebd0e1d77533..92a10990d160ed 100644 --- a/examples/tensorflow/summarization/run_summarization.py +++ b/examples/tensorflow/summarization/run_summarization.py @@ -53,7 +53,7 @@ # region Checking dependencies # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
-check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py index a51939d8d58801..a8f2de825cc2e4 100644 --- a/examples/tensorflow/text-classification/run_glue.py +++ b/examples/tensorflow/text-classification/run_glue.py @@ -47,7 +47,7 @@ # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") task_to_keys = { "cola": ("sentence", None), diff --git a/examples/tensorflow/translation/run_translation.py b/examples/tensorflow/translation/run_translation.py index 50189345d56890..1afb72cf10984e 100644 --- a/examples/tensorflow/translation/run_translation.py +++ b/examples/tensorflow/translation/run_translation.py @@ -56,7 +56,7 @@ # region Dependencies and constants # Will error if the minimal version of Transformers is not installed. Remove at your own risks. -check_min_version("4.47.0.dev0") +check_min_version("4.48.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/setup.py b/setup.py index 2364e3b677fd6d..a9babfaeea67ab 100644 --- a/setup.py +++ b/setup.py @@ -435,7 +435,7 @@ def run(self): setup( name="transformers", - version="4.47.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) + version="4.48.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots) author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)", author_email="transformers@huggingface.co", description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow", diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 625936a45869c8..23940a240876b9 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -18,7 +18,7 @@ # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names # in the namespace without actually importing anything (and especially none of the backends). -__version__ = "4.47.0.dev0" +__version__ = "4.48.0.dev0" from typing import TYPE_CHECKING From 44f88d8ccbb5158fea19e9bd0c5c06aa57e5328b Mon Sep 17 00:00:00 2001 From: Jacky Lee <39754370+jla524@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:06:54 -0800 Subject: [PATCH 02/62] [docs] Update Python version in translations (#35096) update: doc version --- i18n/README_ar.md | 2 +- i18n/README_de.md | 2 +- i18n/README_es.md | 2 +- i18n/README_fr.md | 2 +- i18n/README_hd.md | 2 +- i18n/README_ja.md | 2 +- i18n/README_pt-br.md | 2 +- i18n/README_ru.md | 2 +- i18n/README_te.md | 2 +- i18n/README_ur.md | 10 +++++----- i18n/README_vi.md | 2 +- i18n/README_zh-hant.md | 2 +- 12 files changed, 16 insertions(+), 16 deletions(-) diff --git a/i18n/README_ar.md b/i18n/README_ar.md index c2dd588fdb233f..8160ec908d4411 100644 --- a/i18n/README_ar.md +++ b/i18n/README_ar.md @@ -245,7 +245,7 @@ limitations under the License. ### باستخدام pip -تم اختبار هذا المستودع على Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، و TensorFlow 2.6+. +تم اختبار هذا المستودع على Python 3.9+، Flax 0.4.1+، PyTorch 1.11+، و TensorFlow 2.6+. 
يجب تثبيت 🤗 Transformers في [بيئة افتراضية](https://docs.python.org/3/library/venv.html). إذا كنت غير معتاد على البيئات الافتراضية Python، فراجع [دليل المستخدم](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_de.md b/i18n/README_de.md index 2532c9e12fab59..ccc9e6111a25f0 100644 --- a/i18n/README_de.md +++ b/i18n/README_de.md @@ -246,7 +246,7 @@ Das Modell selbst ist ein reguläres [PyTorch `nn.Module`](https://pytorch.org/d ### Mit pip -Dieses Repository wurde mit Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ und TensorFlow 2.6+ getestet. +Dieses Repository wurde mit Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ und TensorFlow 2.6+ getestet. Sie sollten 🤗 Transformers in einer [virtuellen Umgebung](https://docs.python.org/3/library/venv.html) installieren. Wenn Sie mit virtuellen Python-Umgebungen nicht vertraut sind, schauen Sie sich den [Benutzerleitfaden](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) an. diff --git a/i18n/README_es.md b/i18n/README_es.md index 6682147d7867cf..5d5ba1b3249785 100644 --- a/i18n/README_es.md +++ b/i18n/README_es.md @@ -222,7 +222,7 @@ El modelo en si es un [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.h ### Con pip -Este repositorio está probado en Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ y TensorFlow 2.6+. +Este repositorio está probado en Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ y TensorFlow 2.6+. Deberías instalar 🤗 Transformers en un [entorno virtual](https://docs.python.org/3/library/venv.html). Si no estas familiarizado con los entornos virtuales de Python, consulta la [guía de usuario](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_fr.md b/i18n/README_fr.md index c1eaa10edb927d..97b11166b301a1 100644 --- a/i18n/README_fr.md +++ b/i18n/README_fr.md @@ -243,7 +243,7 @@ Le modèle lui-même est un module [`nn.Module` PyTorch](https://pytorch.org/doc ### Avec pip -Ce référentiel est testé sur Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ et TensorFlow 2.6+. +Ce référentiel est testé sur Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ et TensorFlow 2.6+. Vous devriez installer 🤗 Transformers dans un [environnement virtuel](https://docs.python.org/3/library/venv.html). Si vous n'êtes pas familier avec les environnements virtuels Python, consultez le [guide utilisateur](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). 
diff --git a/i18n/README_hd.md b/i18n/README_hd.md index 07077e5dd9c37d..17efdd21eb04dc 100644 --- a/i18n/README_hd.md +++ b/i18n/README_hd.md @@ -198,7 +198,7 @@ checkpoint: जाँच बिंदु ### पिप का उपयोग करना -इस रिपॉजिटरी का परीक्षण Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ और TensorFlow 2.6+ के तहत किया गया है। +इस रिपॉजिटरी का परीक्षण Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ और TensorFlow 2.6+ के तहत किया गया है। आप [वर्चुअल एनवायरनमेंट](https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें। diff --git a/i18n/README_ja.md b/i18n/README_ja.md index 293a5ee111b0c7..3d417098ea314d 100644 --- a/i18n/README_ja.md +++ b/i18n/README_ja.md @@ -256,7 +256,7 @@ Hugging Faceチームによって作られた **[トランスフォーマーを ### pipにて -このリポジトリは、Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+ でテストされています。 +このリポジトリは、Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, TensorFlow 2.6+ でテストされています。 🤗Transformersは[仮想環境](https://docs.python.org/3/library/venv.html)にインストールする必要があります。Pythonの仮想環境に慣れていない場合は、[ユーザーガイド](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)を確認してください。 diff --git a/i18n/README_pt-br.md b/i18n/README_pt-br.md index 79007e5aaa33f9..d9248f9a151c36 100644 --- a/i18n/README_pt-br.md +++ b/i18n/README_pt-br.md @@ -253,7 +253,7 @@ O modelo em si é um [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.ht ### Com pip -Este repositório é testado no Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ e TensorFlow 2.6+. +Este repositório é testado no Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ e TensorFlow 2.6+. Você deve instalar o 🤗 Transformers em um [ambiente virtual](https://docs.python.org/3/library/venv.html). Se você não está familiarizado com ambientes virtuais em Python, confira o [guia do usuário](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_ru.md b/i18n/README_ru.md index ebab1236ca8542..a359b52d2ccc73 100644 --- a/i18n/README_ru.md +++ b/i18n/README_ru.md @@ -244,7 +244,7 @@ Hugging Face Hub. Мы хотим, чтобы Transformers позволил ра ### С помощью pip -Данный репозиторий протестирован на Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ и TensorFlow 2.6+. +Данный репозиторий протестирован на Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ и TensorFlow 2.6+. Устанавливать 🤗 Transformers следует в [виртуальной среде](https://docs.python.org/3/library/venv.html). Если вы не знакомы с виртуальными средами Python, ознакомьтесь с [руководством пользователя](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_te.md b/i18n/README_te.md index feb537ad1a48d2..a9795e9ca326aa 100644 --- a/i18n/README_te.md +++ b/i18n/README_te.md @@ -246,7 +246,7 @@ limitations under the License. ### పిప్ తో -ఈ రిపోజిటరీ పైథాన్ 3.8+, ఫ్లాక్స్ 0.4.1+, PyTorch 1.11+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది. +ఈ రిపోజిటరీ పైథాన్ 3.9+, ఫ్లాక్స్ 0.4.1+, PyTorch 1.11+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది. మీరు [వర్చువల్ వాతావరణం](https://docs.python.org/3/library/venv.html)లో 🤗 ట్రాన్స్‌ఫార్మర్‌లను ఇన్‌స్టాల్ చేయాలి. మీకు పైథాన్ వర్చువల్ పరిసరాల గురించి తెలియకుంటే, [యూజర్ గైడ్](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) చూడండి. 
diff --git a/i18n/README_ur.md b/i18n/README_ur.md index e14c8707770791..cc37b5cfc4223d 100644 --- a/i18n/README_ur.md +++ b/i18n/README_ur.md @@ -91,7 +91,7 @@ limitations under the License. - [‏BART کے ساتھ خلاصہ کاری](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct) - [‏DistilBERT کے ساتھ سوالات کے جوابات](https://huggingface.co/distilbert/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species) - [‏T5 کے ساتھ ترجمہ](https://huggingface.co/google-t5/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin) - + کمپیوٹر وژن میں: - [‏ViT کے ساتھ امیج کی درجہ بندی](https://huggingface.co/google/vit-base-patch16-224) - [‏DETR کے ساتھ اشیاء کی شناخت](https://huggingface.co/facebook/detr-resnet-50) @@ -224,7 +224,7 @@ limitations under the License. ## مجھے Transformers کیوں استعمال کرنا چاہیے؟ ‏ 1. استعمال میں آسان جدید ترین ماڈلز: - + - قدرتی زبان کی سمجھ اور تخلیق، کمپیوٹر وژن، اور آڈیو کے کاموں میں اعلی کارکردگی۔ - معلمین اور عملی ماہرین کے لیے کم داخلی رکاوٹ۔ - سیکھنے کے لیے صرف تین کلاسز کے ساتھ چند یوزر فرینڈلی ایبسٹریکشنز۔ @@ -236,7 +236,7 @@ limitations under the License. - عملی ماہرین کمپیوٹ وقت اور پروڈکشن اخراجات کو کم کر سکتے ہیں۔ - ہر موڈیلٹی کے لیے 400,000 سے زیادہ pretrained ماڈلز کے ساتھ درجنوں آرکیٹیکچرز۔ -‏ 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح +‏ 3. ماڈل کے لائف ٹائم کے ہر حصے کے لیے صحیح فریم ورک کا انتخاب کریں: - 3 لائنز کے کوڈ میں جدید ترین ماڈلز تربیت دیں۔ @@ -259,7 +259,7 @@ limitations under the License. 
#### ‏ pip کے ساتھ -یہ ریپوزٹری Python 3.8+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔ +یہ ریپوزٹری Python 3.9+، Flax 0.4.1+، PyTorch 1.11+، اور TensorFlow 2.6+ پر ٹیسٹ کی گئی ہے۔ آپ کو 🤗 Transformers کو ایک [ورچوئل ماحول](https://docs.python.org/3/library/venv.html) میں انسٹال کرنا چاہیے۔ اگر آپ Python ورچوئل ماحول سے واقف نہیں ہیں، تو [یوزر گائیڈ](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) دیکھیں۔ @@ -290,7 +290,7 @@ Flax، PyTorch، یا TensorFlow کو conda کے ساتھ انسٹال کرنے > **_نوٹ:_** ونڈوز پر، آپ کو کیشنگ سے فائدہ اٹھانے کے لیے ڈویلپر موڈ کو ایکٹیویٹ کرنے کا پیغام دیا جا سکتا ہے۔ اگر یہ آپ کے لیے ممکن نہیں ہے، تو براہ کرم ہمیں [اس مسئلے](https://github.com/huggingface/huggingface_hub/issues/1062) میں بتائیں۔ -### ماڈل کی تعمیرات +### ماڈل کی تعمیرات ‏ 🤗 Transformers کی طرف سے فراہم کردہ **[تمام ماڈل چیک پوائنٹس](https://huggingface.co/models)** ہگنگ فیس کے ماڈل حب [model hub](https://huggingface.co/models) سے بآسانی مربوط ہیں، جہاں یہ براہ راست [صارفین](https://huggingface.co/users) اور [تنظیموں](https://huggingface.co/organizations) کے ذریعہ اپ لوڈ کیے جاتے ہیں۔ diff --git a/i18n/README_vi.md b/i18n/README_vi.md index 5e5c2ab1e25cf7..f523c282b680c4 100644 --- a/i18n/README_vi.md +++ b/i18n/README_vi.md @@ -245,7 +245,7 @@ Chính mô hình là một [Pytorch `nn.Module`](https://pytorch.org/docs/stable ### Sử dụng pip -Thư viện này được kiểm tra trên Python 3.8+, Flax 0.4.1+, PyTorch 1.11+ và TensorFlow 2.6+. +Thư viện này được kiểm tra trên Python 3.9+, Flax 0.4.1+, PyTorch 1.11+ và TensorFlow 2.6+. Bạn nên cài đặt 🤗 Transformers trong một [môi trường ảo Python](https://docs.python.org/3/library/venv.html). Nếu bạn chưa quen với môi trường ảo Python, hãy xem [hướng dẫn sử dụng](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). diff --git a/i18n/README_zh-hant.md b/i18n/README_zh-hant.md index e20798a2d4571f..87c623ee84a61b 100644 --- a/i18n/README_zh-hant.md +++ b/i18n/README_zh-hant.md @@ -210,7 +210,7 @@ Tokenizer 為所有的預訓練模型提供了預處理,並可以直接轉換 ### 使用 pip -這個 Repository 已在 Python 3.8+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下經過測試。 +這個 Repository 已在 Python 3.9+、Flax 0.4.1+、PyTorch 1.11+ 和 TensorFlow 2.6+ 下經過測試。 你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境,請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。 From 98e8062df3cf82b367e8a243963e4afb0d9d3407 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:24:51 -0800 Subject: [PATCH 03/62] [docs] top_p, top_k, temperature docstrings (#35065) clarify --- src/transformers/generation/configuration_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 0a6fdd9fb51a5f..18cf26b8b73415 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -195,12 +195,12 @@ class GenerationConfig(PushToHubMixin): > Parameters for manipulation of the model output logits temperature (`float`, *optional*, defaults to 1.0): - The value used to modulate the next token probabilities. + The value used to module the next token probabilities. This value is set in a model's `generation_config.json` file. 
If it isn't set, the default value is 1.0 top_k (`int`, *optional*, defaults to 50): - The number of highest probability vocabulary tokens to keep for top-k-filtering. + The number of highest probability vocabulary tokens to keep for top-k-filtering. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 50. top_p (`float`, *optional*, defaults to 1.0): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to - `top_p` or higher are kept for generation. + `top_p` or higher are kept for generation. This value is set in a model's `generation_config.json` file. If it isn't set, the default value is 1.0 min_p (`float`, *optional*): Minimum token probability, which will be scaled by the probability of the most likely token. It must be a value between 0 and 1. Typical values are in the 0.01-0.2 range, comparably selective as setting `top_p` in From 15ab310c3a5318530b15f83ac5bf57c3718d1b2f Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 6 Dec 2024 12:03:31 +0100 Subject: [PATCH 04/62] Fix private forked repo. CI (#35114) fix Co-authored-by: ydshieh --- .circleci/create_circleci_config.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.circleci/create_circleci_config.py b/.circleci/create_circleci_config.py index 7ccf5ec96cec4f..84c0f65166baef 100644 --- a/.circleci/create_circleci_config.py +++ b/.circleci/create_circleci_config.py @@ -133,7 +133,7 @@ def to_dict(self): "command": """dpkg-query --show --showformat='${Installed-Size}\t${Package}\n' | sort -rh | head -25 | sort -h | awk '{ package=$2; sub(".*/", "", package); printf("%.5f GB %s\n", $1/1024/1024, package)}' || true"""} }, {"run": {"name": "Create `test-results` directory", "command": "mkdir test-results"}}, - {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <>' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, + {"run": {"name": "Get files to test", "command":f'curl -L -o {self.job_name}_test_list.txt <> --header "Circle-Token: $CIRCLE_TOKEN"' if self.name != "pr_documentation_tests" else 'echo "Skipped"'}}, {"run": {"name": "Split tests across parallel nodes: show current parallel tests", "command": f"TESTS=$(circleci tests split --split-by=timings {self.job_name}_test_list.txt) && echo $TESTS > splitted_tests.txt && echo $TESTS | tr ' ' '\n'" if self.parallelism else f"awk '{{printf \"%s \", $0}}' {self.job_name}_test_list.txt > splitted_tests.txt" } @@ -371,9 +371,14 @@ def create_circleci_config(folder=None): **{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs}, **{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs}, }, - "jobs" : {j.job_name: j.to_dict() for j in jobs}, - "workflows": {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} + "jobs" : {j.job_name: j.to_dict() for j in jobs} } + if "CIRCLE_TOKEN" in os.environ: + # For private forked repo. (e.g. new model addition) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [{j.job_name: {"context": ["TRANSFORMERS_CONTEXT"]}} for j in jobs]}} + else: + # For public repo. (e.g. 
`transformers`) + config["workflows"] = {"version": 2, "run_tests": {"jobs": [j.job_name for j in jobs]}} with open(os.path.join(folder, "generated_config.yml"), "w") as f: f.write(yaml.dump(config, sort_keys=False, default_flow_style=False).replace("' << pipeline", " << pipeline").replace(">> '", " >>")) From 9ad4c93536855d78bcc3ea56a95bd53dc95d1a8e Mon Sep 17 00:00:00 2001 From: Aymeric Roucher <69208727+aymeric-roucher@users.noreply.github.com> Date: Fri, 6 Dec 2024 12:17:34 +0100 Subject: [PATCH 05/62] Add Aria (#34157) * Add Aria --------- Co-authored-by: Cyril Vallez Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 3 + docs/source/en/model_doc/aria.md | 106 + docs/source/en/model_doc/idefics3.md | 7 + docs/source/en/perf_infer_gpu_one.md | 2 + src/transformers/__init__.py | 32 + src/transformers/generation/utils.py | 1 + src/transformers/models/__init__.py | 1 + src/transformers/models/aria/__init__.py | 30 + .../models/aria/configuration_aria.py | 299 +++ .../models/aria/convert_aria_weights_to_hf.py | 162 ++ .../models/aria/image_processing_aria.py | 504 +++++ src/transformers/models/aria/modeling_aria.py | 1920 +++++++++++++++++ src/transformers/models/aria/modular_aria.py | 1598 ++++++++++++++ .../models/aria/processing_aria.py | 164 ++ .../models/auto/configuration_auto.py | 8 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 5 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + src/transformers/models/idefics3/__init__.py | 6 +- src/transformers/utils/dummy_pt_objects.py | 49 + .../utils/dummy_vision_objects.py | 7 + tests/generation/test_utils.py | 1 + tests/models/aria/__init__.py | 0 .../models/aria/test_image_processing_aria.py | 268 +++ tests/models/aria/test_modeling_aria.py | 669 ++++++ tests/models/aria/test_processor_aria.py | 391 ++++ utils/check_config_attributes.py | 2 +- utils/check_docstrings.py | 7 +- utils/check_repo.py | 2 + utils/modular_model_converter.py | 2 +- 32 files changed, 6244 insertions(+), 7 deletions(-) create mode 100644 docs/source/en/model_doc/aria.md create mode 100644 src/transformers/models/aria/__init__.py create mode 100644 src/transformers/models/aria/configuration_aria.py create mode 100644 src/transformers/models/aria/convert_aria_weights_to_hf.py create mode 100644 src/transformers/models/aria/image_processing_aria.py create mode 100644 src/transformers/models/aria/modeling_aria.py create mode 100644 src/transformers/models/aria/modular_aria.py create mode 100644 src/transformers/models/aria/processing_aria.py create mode 100644 tests/models/aria/__init__.py create mode 100644 tests/models/aria/test_image_processing_aria.py create mode 100644 tests/models/aria/test_modeling_aria.py create mode 100644 tests/models/aria/test_processor_aria.py diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 3521d4ccfed894..6e325e499f342d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -810,6 +810,8 @@ title: ALIGN - local: model_doc/altclip title: AltCLIP + - local: model_doc/aria + title: Aria - local: model_doc/blip title: BLIP - local: model_doc/blip-2 diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 3cad4e663f23fd..181ec8b10fb1fc 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -62,6 +62,8 @@ Flax), PyTorch, and/or TensorFlow. 
| [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ | | [ALIGN](model_doc/align) | ✅ | ❌ | ❌ | | [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ | +| [Aria](model_doc/aria) | ✅ | ❌ | ❌ | +| [AriaText](model_doc/aria_text) | ✅ | ❌ | ❌ | | [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ | | [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ | | [Bark](model_doc/bark) | ✅ | ❌ | ❌ | @@ -172,6 +174,7 @@ Flax), PyTorch, and/or TensorFlow. | [IDEFICS](model_doc/idefics) | ✅ | ✅ | ❌ | | [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ | | [Idefics3](model_doc/idefics3) | ✅ | ❌ | ❌ | +| [Idefics3VisionTransformer](model_doc/idefics3_vision) | ❌ | ❌ | ❌ | | [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ | | [Informer](model_doc/informer) | ✅ | ❌ | ❌ | | [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ | diff --git a/docs/source/en/model_doc/aria.md b/docs/source/en/model_doc/aria.md new file mode 100644 index 00000000000000..9ff7a6687aa939 --- /dev/null +++ b/docs/source/en/model_doc/aria.md @@ -0,0 +1,106 @@ + + +# Aria + +## Overview + +The Aria model was proposed in [Aria: An Open Multimodal Native Mixture-of-Experts Model](https://huggingface.co/papers/2410.05993) by Li et al. from the Rhymes.AI team. + +Aria is an open multimodal-native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. It has a Mixture-of-Experts architecture, with respectively 3.9B and 3.5B activated parameters per visual token and text token. + +The abstract from the paper is the following: + +*Information comes in diverse modalities. Multimodal native AI models are essential to integrate real-world information and deliver comprehensive understanding. While proprietary multimodal native models exist, their lack of openness imposes obstacles for adoptions, let alone adaptations. To fill this gap, we introduce Aria, an open multimodal native model with best-in-class performance across a wide range of multimodal, language, and coding tasks. Aria is a mixture-of-expert model with 3.9B and 3.5B activated parameters per visual token and text token, respectively. It outperforms Pixtral-12B and Llama3.2-11B, and is competitive against the best proprietary models on various multimodal tasks. We pre-train Aria from scratch following a 4-stage pipeline, which progressively equips the model with strong capabilities in language understanding, multimodal understanding, long context window, and instruction following. We open-source the model weights along with a codebase that facilitates easy adoptions and adaptations of Aria in real-world applications.* + +This model was contributed by [m-ric](https://huggingface.co/m-ric). +The original code can be found [here](https://github.com/rhymes-ai/Aria). 
+ +## Usage tips + +Here's how to use the model for vision tasks: +```python +import requests +import torch +from PIL import Image + +from transformers import AriaProcessor, AriaForConditionalGeneration + +model_id_or_path = "rhymes-ai/Aria" + +model = AriaForConditionalGeneration.from_pretrained( + model_id_or_path, device_map="auto" +) + +processor = AriaProcessor.from_pretrained(model_id_or_path) + +image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + +messages = [ + { + "role": "user", + "content": [ + {"type": "image"}, + {"text": "what is the image?", "type": "text"}, + ], + } +] + +text = processor.apply_chat_template(messages, add_generation_prompt=True) +inputs = processor(text=text, images=image, return_tensors="pt") +inputs.to(model.device) + +output = model.generate( + **inputs, + max_new_tokens=15, + stop_strings=["<|im_end|>"], + tokenizer=processor.tokenizer, + do_sample=True, + temperature=0.9, +) +output_ids = output[0][inputs["input_ids"].shape[1]:] +response = processor.decode(output_ids, skip_special_tokens=True) +``` + + +## AriaImageProcessor + +[[autodoc]] AriaImageProcessor + +## AriaProcessor + +[[autodoc]] AriaProcessor + +## AriaTextConfig + +[[autodoc]] AriaTextConfig + +## AriaConfig + +[[autodoc]] AriaConfig + +## AriaTextModel + +[[autodoc]] AriaTextModel + +## AriaTextForCausalLM + +[[autodoc]] AriaTextForCausalLM + +## AriaForConditionalGeneration + +[[autodoc]] AriaForConditionalGeneration + - forward diff --git a/docs/source/en/model_doc/idefics3.md b/docs/source/en/model_doc/idefics3.md index dfaf40477a7b52..cf7c043e928901 100644 --- a/docs/source/en/model_doc/idefics3.md +++ b/docs/source/en/model_doc/idefics3.md @@ -51,6 +51,13 @@ This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts) [[autodoc]] Idefics3Config +## Idefics3VisionConfig + +[[autodoc]] Idefics3VisionConfig + +## Idefics3VisionTransformer + +[[autodoc]] Idefics3VisionTransformer ## Idefics3Model diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index ec8dea2735b531..ab5e1c47a448f3 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -37,6 +37,7 @@ FlashAttention-2 is experimental and may change considerably in future versions. 2. 
partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them FlashAttention-2 is currently supported for the following architectures: +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon) @@ -216,6 +217,7 @@ PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.o For now, Transformers supports SDPA inference and training for the following architectures: * [Albert](https://huggingface.co/docs/transformers/model_doc/albert#transformers.AlbertModel) +* [Aria](https://huggingface.co/docs/transformers/model_doc/aria#transformers.AriaForConditionalGeneration) * [Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer#transformers.ASTModel) * [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel) * [Bert](https://huggingface.co/docs/transformers/model_doc/bert#transformers.BertModel) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 23940a240876b9..2eaec8f1def96e 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -170,6 +170,11 @@ "AltCLIPTextConfig", "AltCLIPVisionConfig", ], + "models.aria": [ + "AriaConfig", + "AriaProcessor", + "AriaTextConfig", + ], "models.audio_spectrogram_transformer": [ "ASTConfig", "ASTFeatureExtractor", @@ -1176,6 +1181,7 @@ _import_structure["image_processing_base"] = ["ImageProcessingMixin"] _import_structure["image_processing_utils"] = ["BaseImageProcessor"] _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] + _import_structure["models.aria"].extend(["AriaImageProcessor"]) _import_structure["models.beit"].extend(["BeitFeatureExtractor", "BeitImageProcessor"]) _import_structure["models.bit"].extend(["BitImageProcessor"]) _import_structure["models.blip"].extend(["BlipImageProcessor"]) @@ -1406,6 +1412,15 @@ "AltCLIPVisionModel", ] ) + _import_structure["models.aria"].extend( + [ + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextForCausalLM", + "AriaTextModel", + "AriaTextPreTrainedModel", + ] + ) _import_structure["models.audio_spectrogram_transformer"].extend( [ "ASTForAudioClassification", @@ -2461,6 +2476,8 @@ "Idefics3Model", "Idefics3PreTrainedModel", "Idefics3Processor", + "Idefics3VisionConfig", + "Idefics3VisionTransformer", ] ) _import_structure["models.ijepa"].extend( @@ -5033,6 +5050,11 @@ AltCLIPTextConfig, AltCLIPVisionConfig, ) + from .models.aria import ( + AriaConfig, + AriaProcessor, + AriaTextConfig, + ) from .models.audio_spectrogram_transformer import ( ASTConfig, ASTFeatureExtractor, @@ -6096,6 +6118,7 @@ from .image_processing_base import ImageProcessingMixin from .image_processing_utils import BaseImageProcessor from .image_utils import ImageFeatureExtractionMixin + from .models.aria import AriaImageProcessor from .models.beit import BeitFeatureExtractor, BeitImageProcessor from .models.bit import BitImageProcessor from .models.blip import BlipImageProcessor @@ -6325,6 +6348,13 @@ AltCLIPTextModel, AltCLIPVisionModel, ) + from .models.aria import ( + AriaForConditionalGeneration, + AriaPreTrainedModel, + AriaTextForCausalLM, + AriaTextModel, + AriaTextPreTrainedModel, 
+ ) from .models.audio_spectrogram_transformer import ( ASTForAudioClassification, ASTModel, @@ -7189,6 +7219,8 @@ Idefics3Model, Idefics3PreTrainedModel, Idefics3Processor, + Idefics3VisionConfig, + Idefics3VisionTransformer, ) from .models.ijepa import ( IJepaForImageClassification, diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 015cbebaa8e5dc..89c57cb913fec2 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1465,6 +1465,7 @@ def _prepare_generated_length( elif ( model_input_name == "inputs_embeds" and input_ids_length != inputs_tensor.shape[1] + and input_ids_length != 0 and not self.config.is_encoder_decoder ): generation_config.max_length -= inputs_tensor.shape[1] diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index e957d802d80e71..116b71c81ad9df 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -16,6 +16,7 @@ albert, align, altclip, + aria, audio_spectrogram_transformer, auto, autoformer, diff --git a/src/transformers/models/aria/__init__.py b/src/transformers/models/aria/__init__.py new file mode 100644 index 00000000000000..f73301321527c1 --- /dev/null +++ b/src/transformers/models/aria/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aria import * + from .image_processing_aria import * + from .modeling_aria import * + from .processing_aria import * + +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/aria/configuration_aria.py b/src/transformers/models/aria/configuration_aria.py new file mode 100644 index 00000000000000..ff34d59f5dfe1a --- /dev/null +++ b/src/transformers/models/aria/configuration_aria.py @@ -0,0 +1,299 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ..auto import CONFIG_MAPPING, AutoConfig + + +class AriaTextConfig(PretrainedConfig): + r""" + This class handles the configuration for the text component of the Aria model. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 4096): + The size of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 2): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. 
Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + moe_num_experts (`int`, *optional*, defaults to 8): + The number of experts in the MoE layer. 
+ moe_topk (`int`, *optional*, defaults to 2): + The number of top experts to route to for each token. + moe_num_shared_experts (`int`, *optional*, defaults to 2): + The number of shared experts. + """ + + model_type = "aria_text" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `AriaTextModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_config_key = "text_config" + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size: int = 4096, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=2, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + head_dim=None, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + r""" + This class handles the configuration for both vision and text components of the Aria model, + as well as additional parameters for image token handling and projector mapping. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`AriaVisionConfig` or `dict`, *optional*): + Configuration for the vision component. 
+ vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + text_config (`AriaTextConfig` or `dict`, *optional*): + Configuration for the text component. + projector_patch_to_query_dict (`dict`, *optional*): + Mapping of patch sizes to query dimensions. + image_token_index (`int`, *optional*, defaults to 9): + Index used to represent image tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + + Attributes: + model_type (`str`): + Type of the model, set to `"aria"`. + image_token_index (`int`): + Index used to represent image tokens. + projector_patch_to_query_dict (`dict`): + Mapping of patch sizes to query dimensions. + vision_config (`AriaVisionConfig`): + Configuration for the vision component. + text_config (`AriaTextConfig`): + Configuration for the text component. + """ + + model_type = "aria" + sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + vision_feature_layer: int = -1, + text_config: AriaTextConfig = None, + projector_patch_to_query_dict: Dict = None, + image_token_index: int = 9, + initializer_range: float = 0.02, + **kwargs, + ): + self.image_token_index = image_token_index + + # Convert the keys and values of projector_patch_to_query_dict to integers + # This ensures consistency even if they were provided as strings + if projector_patch_to_query_dict is None: + projector_patch_to_query_dict = { + 1225: 128, + 4900: 256, + } + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) + self.vision_feature_layer = vision_feature_layer + if isinstance(vision_config, dict): + vision_config["model_type"] = "idefics3_vision" + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["idefics3_vision"]() + + self.vision_config = vision_config + self.initializer_range = initializer_range + + if isinstance(text_config, dict) and "model_type" in text_config: + text_config = AriaTextConfig(**text_config) + elif text_config is None: + text_config = AriaTextConfig() + + self.text_config = text_config + + super().__init__(**kwargs) + + +__all__ = ["AriaConfig", "AriaTextConfig"] diff --git a/src/transformers/models/aria/convert_aria_weights_to_hf.py b/src/transformers/models/aria/convert_aria_weights_to_hf.py new file mode 100644 index 00000000000000..dcc9e4d1397672 --- /dev/null +++ b/src/transformers/models/aria/convert_aria_weights_to_hf.py @@ -0,0 +1,162 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
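+"""
+Convert the original rhymes-ai/Aria checkpoint to the Hugging Face Transformers format.
+
+The script downloads the original safetensors weights, remaps their keys with
+`KEYS_TO_MODIFY_MAPPING`, loads them into `AriaForConditionalGeneration`, builds an
+`AriaProcessor` with the Aria chat template, and pushes both model and processor to the Hub as
+pull requests. See `EPILOG_TXT` below for an example invocation.
+"""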
+import argparse +import glob + +import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open + +from transformers import ( + AddedToken, + AriaForConditionalGeneration, + AriaProcessor, + AutoConfig, + AutoTokenizer, +) + + +EPILOG_TXT = """Example: + python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria + +Example for creating the old state dict file with Python: + + import torch + from aria.model.language_model.aria_llama import AriaTextForCausalLM + + # load model + kwargs = {"device_map": "auto", "torch_dtype": torch.float16} + model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) + + # load vision tower + model.get_vision_tower().load_model() + + # Save state dict + torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") +""" + +KEYS_TO_MODIFY_MAPPING = { + "vision_tower.vision_model": "vision_tower", + "ln_ffn": "layer_norm", + "ffn": "feed_forward", + "ln_kv": "layer_norm_kv", +} + + +def load_original_state_dict(model_id): + directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) + + original_state_dict = {} + for path in glob.glob(f"{directory_path}/*"): + if path.endswith(".safetensors"): + with safe_open(path, framework="pt", device="cpu") as f: + for key in f.keys(): + original_state_dict[key] = f.get_tensor(key) + + return original_state_dict + + +def convert_state_dict_to_hf(state_dict): + new_state_dict = {} + for key, value in state_dict.items(): + if key.endswith(".inv_freq"): + continue + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + new_state_dict[key] = value + new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) + new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) + + return new_state_dict + + +def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): + torch.set_default_dtype(torch.float16) + + tokenizer = AutoTokenizer.from_pretrained( + text_model_id, + extra_special_tokens={ + "image_token": "<|img|>", + "pad_token": "", + }, + ) + tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) + tokenizer.add_special_tokens({"pad_token": ""}) + tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" + + processor = AriaProcessor.from_pretrained( + text_model_id, + tokenizer=tokenizer, + ) + + config = AutoConfig.from_pretrained(text_model_id) + config.vision_config.hidden_size = 1152 + config.vision_config.attention_heads = 16 + config.pad_token_id = 2 + config.image_token_index = 9 + config.intermediate_size = config.moe_intermediate_size + config.auto_map = { + "AutoConfig": "modeling_aria.AriaConfig", + "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", + } + + with 
torch.device("meta"): + model = AriaForConditionalGeneration(config) + + state_dict = load_original_state_dict(old_state_dict_id) + + state_dict = convert_state_dict_to_hf(state_dict) + model.load_state_dict(state_dict, strict=False, assign=True) + + # print("Saving models") + # model.save_pretrained("local_aria", safe_serialization=False) + # processor.save_pretrained("local_aria") + print("Pushing to hub") + model.push_to_hub(output_hub_path, create_pr=True) + processor.push_to_hub(output_hub_path, create_pr=True) + + +def main(): + parser = argparse.ArgumentParser( + epilog=EPILOG_TXT, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--text_model_id", + default="rhymes-ai/Aria", + help="Hub location of the text model", + ) + parser.add_argument( + "--vision_model_id", + default="rhymes-ai/Aria", + help="Hub location of the vision model", + ) + parser.add_argument( + "--output_hub_path", + default="rhymes-ai/Aria", + help="Location on the hub of the converted model", + ) + parser.add_argument( + "--old_state_dict_id", + default="rhymes-ai/Aria", + help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", + ) + args = parser.parse_args() + convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/aria/image_processing_aria.py b/src/transformers/models/aria/image_processing_aria.py new file mode 100644 index 00000000000000..7b00665aa2859d --- /dev/null +++ b/src/transformers/models/aria/image_processing_aria.py @@ -0,0 +1,504 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_valid_image, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType + + +def make_batched_images(images) -> List[List[ImageInput]]: + """ + Accepts images in list or nested list format, and makes a list of images for preprocessing. 
+ + Args: + images (`Union[List[List[ImageInput]], List[ImageInput], ImageInput]`): + The input image. + + Returns: + list: A list of images. + """ + if isinstance(images, (list, tuple)) and isinstance(images[0], (list, tuple)) and is_valid_image(images[0][0]): + return [img for img_list in images for img in img_list] + + elif isinstance(images, (list, tuple)) and is_valid_image(images[0]): + return images + + elif is_valid_image(images): + return [images] + + raise ValueError(f"Could not make batched video from {images}") + + +def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: + """ + Divides an image into patches of a specified size. + + Args: + image (`np.array`): + The input image. + patch_size (`int`): + The size of each patch. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + list: A list of np.array representing the patches. + """ + patches = [] + height, width = get_image_size(image, channel_dim=input_data_format) + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + if input_data_format == ChannelDimension.LAST: + patch = image[i : i + patch_size, j : j + patch_size] + else: + patch = image[:, i : i + patch_size, j : j + patch_size] + patches.append(patch) + + return patches + + +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class AriaImageProcessor(BaseImageProcessor): + """ + A vision processor for the Aria model that handles image preprocessing. + Initialize the AriaImageProcessor. + + Args: + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to 980): + Maximum image size. + min_image_size (`int`, *optional*, defaults to 336): + Minimum image size. + split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples): + The optimal resolutions for splitting the image. + split_image (`bool`, *optional*, defaults to `False`): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `BICUBIC`): + The resampling filter to use if resizing the image. 
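+
+    Example (an illustrative sketch using the default processor settings):
+
+    ```python
+    >>> import numpy as np
+    >>> from PIL import Image
+    >>> from transformers import AriaImageProcessor
+
+    >>> image_processor = AriaImageProcessor()
+    >>> image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
+    >>> inputs = image_processor(images=image, return_tensors="pt")
+    >>> list(inputs.keys())
+    ['pixel_values', 'pixel_mask', 'num_crops']
+    >>> inputs["pixel_values"].shape
+    torch.Size([1, 3, 980, 980])
+    ```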
+ """ + + def __init__( + self, + image_mean: List[float] = None, + image_std: List[float] = None, + max_image_size: int = 980, + min_image_size: int = 336, + split_resolutions: Optional[List[Tuple[int, int]]] = None, + split_image: Optional[bool] = False, + do_convert_rgb: Optional[bool] = True, + do_normalize: Optional[bool] = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ): + super().__init__(**kwargs) + + if image_mean is None: + image_mean = [0.5, 0.5, 0.5] + if image_std is None: + image_std = [0.5, 0.5, 0.5] + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.image_mean = image_mean + self.image_std = image_std + self.split_image = split_image + if split_resolutions is None: + split_resolutions = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (2, 4), (2, 3), (2, 2), (2, 1), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)] # fmt: skip + split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] + self.split_resolutions = split_resolutions + self.do_convert_rgb = do_convert_rgb + self.do_normalize = do_normalize + self.resample = resample + + def preprocess( + self, + images: Union[ImageInput, List[ImageInput]], + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + max_image_size: Optional[int] = None, + min_image_size: Optional[int] = None, + split_image: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Process a list of images. + + Args: + images (ImageInput or list of ImageInput): + The input image or a list of images. + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)): + Maximum image size. + min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)): + Minimum image size. + split_image (`bool`, *optional*, defaults to `self.split_image` (False)): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): + The resampling filter to use if resizing the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"): + The type of tensor to return. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. 
+ - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + BatchFeature: + A BatchFeature object containing: + - 'pixel_values': + Tensor of processed image pixel values. + - 'pixel_mask': + Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where: + - True (1) values indicate pixels that belong to the original resized image. + - False (0) values indicate pixels that are part of the padding. + The mask helps distinguish between actual image content and padded areas in subsequent processing steps. + - 'num_crops': + The maximum number of crops across all images. + """ + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + max_image_size = max_image_size if max_image_size is not None else self.max_image_size + min_image_size = min_image_size if min_image_size is not None else self.min_image_size + split_image = split_image if split_image is not None else self.split_image + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if max_image_size not in [490, 980]: + raise ValueError("max_image_size must be either 490 or 980") + + images = make_batched_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + pixel_values = [] + pixel_masks = [] + num_crops = None + + for image in images: + if split_image: + crop_images = self.get_image_patches( + image, + self.split_resolutions, + max_image_size, + resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + else: + crop_images = [image] + if num_crops is None or len(crop_images) > num_crops: + num_crops = len(crop_images) + + for crop_image in crop_images: + # At this point the scale is the rescaling factor that would bring the image to max_size in its larger dimension + h, w = get_image_size(crop_image) + scale = max_image_size / max(h, w) + if w >= h: + new_size = (max(int(h * scale), min_image_size), max_image_size) # h, w + else: + new_size = (max_image_size, max(int(w * scale), min_image_size)) # h, w + + crop_image_resized = resize( + crop_image, + new_size, + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + padding_bottom, padding_right = max_image_size - new_size[0], max_image_size - new_size[1] + crop_image_padded = pad( + crop_image_resized, + ((0, padding_bottom), (0, padding_right)), + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # Create a pixel mask + pixel_mask = np.zeros((max_image_size, max_image_size), dtype=bool) + pixel_mask[: new_size[0], : new_size[1]] = 1 + pixel_masks.append(pixel_mask) + + if do_normalize: + crop_image_padded = self.normalize( + crop_image_padded / 255.0, + self.image_mean, + self.image_std, + data_format=input_data_format, + input_data_format=input_data_format, + ) + crop_image_padded = ( + to_channel_dimension_format(crop_image_padded, data_format, input_data_format) + if data_format is not None + else crop_image_padded + ) + + pixel_values.append(crop_image_padded) + return BatchFeature( + data={ + "pixel_values": np.stack(pixel_values, axis=0), + "pixel_mask": np.stack(pixel_masks, axis=0), + "num_crops": num_crops, + }, + tensor_type=return_tensors, + ) + + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. 
+ """ + target_height, target_width = target_resolution + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + + return padded_image + + def pad( + self, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`) + dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected + as input. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. 
+ + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + padding_mode_mapping = { + PaddingMode.CONSTANT: "constant", + PaddingMode.REFLECT: "reflect", + PaddingMode.REPLICATE: "edge", + PaddingMode.SYMMETRIC: "symmetric", + } + image = np.pad(image, padding, mode=padding_mode_mapping[mode], constant_values=constant_values) + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def get_image_patches( + self, + image: np.array, + grid_pinpoints: List[Tuple[int, int]], + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (`np.array`): + The input image to be processed. + grid_pinpoints (List[Tuple[int, int]]): + A list of possible resolutions as tuples. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + `List[np.array]`: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + return patches + + +__all__ = ["AriaImageProcessor"] diff --git a/src/transformers/models/aria/modeling_aria.py b/src/transformers/models/aria/modeling_aria.py new file mode 100644 index 00000000000000..1b4e4087b1a49d --- /dev/null +++ b/src/transformers/models/aria/modeling_aria.py @@ -0,0 +1,1920 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs, _flash_attention_forward +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS +from ...modeling_utils import PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + LossKwargs, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_available +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_aria import AriaConfig, AriaTextConfig + + +if is_torch_available(): + import torch + from torch import nn + + +logger = logging.get_logger(__name__) +_CONFIG_FOR_DOC = "AriaTextConfig" + + +class AriaTextRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + AriaTextRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class AriaProjectorMLP(nn.Module): + """ + Feed-Forward Network module for the Aria Projector. + + Args: + in_features (`int`): + Input embedding dimension. + hidden_features (`int`): + Hidden dimension of the feed-forward network. + output_dim (`int`): + Output dimension. + """ + + def __init__(self, in_features, hidden_features, output_dim): + super().__init__() + self.linear_in = nn.Linear(in_features, hidden_features, bias=False) + self.linear_out = nn.Linear(hidden_features, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class AriaCrossAttention(nn.Module): + """ + Aria Cross-Attention module. + + Args: + config (`AriaConfig`): + The configuration to use. 
+ """ + + def __init__(self, config: AriaConfig, dropout_rate: float = 0): + super().__init__() + hidden_size = config.vision_config.hidden_size + num_heads = config.vision_config.num_attention_heads + self.num_heads = num_heads + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False) + + # Original code here: https://github.com/rhymes-ai/Aria/blob/719ff4e52b727443cba3793b0e27fe64e0244fe1/aria/model/projector.py#L48 + self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) + self.linear = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(dropout_rate) + + self.layer_norm = nn.LayerNorm(hidden_size) + self.layer_norm_kv = nn.LayerNorm(hidden_size) + + def forward(self, key_value_states, hidden_states, attn_mask=None): + """ + Forward pass of the AriaCrossAttention module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor for key and value. + hidden_states (`torch.Tensor`): + Input tensor for query. + attn_mask (`torch.Tensor`, *optional*, defaults to None): + Attention mask. + + Returns: + torch.Tensor: + Output tensor after cross-attention. + """ + query = self.q_proj(self.layer_norm(hidden_states)) + + key_value_states = self.layer_norm_kv(key_value_states) + key = self.k_proj(key_value_states) + value = self.v_proj(key_value_states) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + Aria Projector module. + + This module projects vision features into the language model's embedding space, enabling interaction between vision and language components. + + Args: + config (`AriaConfig`): + Configuration object for the model. + """ + + def __init__( + self, + config: AriaConfig, + ): + super().__init__() + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size + + self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features)) + + self.cross_attn = AriaCrossAttention(config) + + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim) + + def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + """ + Forward pass of the Projector module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (`torch.Tensor`, *optional*, default is None): + Attention mask. + + Returns: + `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim). + """ + batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1] + + if num_patches not in self.patch_to_query_dict.keys(): + raise KeyError( + f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}." 
+ ) + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask) + + out = self.feed_forward(self.layer_norm(attention_out)) + + return out + + +class AriaSharedExpertsMLP(nn.Module): + """ + Shared Expert MLP for shared experts. + + Unlike routed experts, shared experts process all tokens without routing. + This class reconfigures the intermediate size in comparison to the LlamaMLP. + + Args: + config (`AriaTextConfig`): Configuration object for the Aria language model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): + """ + Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts. + + Args: + token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features). + expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features). + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + num_tokens = token_states.shape[0] + out_features = expert_weights.shape[-1] + output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + + for expert_num in range(expert_weights.shape[0]): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + tokens = token_states[start:end] + + out = torch.matmul(tokens, expert_weights[expert_num]) + output[start:end] = out + return output + + +class AriaGroupedExpertsGemm(nn.Module): + """ + Grouped GEMM (General Matrix Multiplication) module for efficient expert computation. + This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm) + for optimized performance. If the grouped_gemm library is not installed, it gracefully + falls back to a sequential GEMM implementation, which may be slower but ensures + functionality. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + groups (`int`): + Number of expert groups. 
+ """ + + def __init__(self, in_features, out_features, groups): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.groups = groups + self.weight = nn.Parameter(torch.empty(groups, in_features, out_features)) + + def forward(self, input, tokens_per_expert): + """ + Perform grouped matrix multiplication. + + Args: + input (`torch.Tensor`): + Input tensor of shape (num_tokens, in_features). + tokens_per_expert (`torch.Tensor`): + Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + return sequential_experts_gemm( + input, + self.weight, + tokens_per_expert.cpu(), + ) + + +class AriaGroupedExpertsMLP(nn.Module): + """ + Grouped MLP module for Mixture of Experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + def __init__(self, config: AriaTextConfig) -> None: + super().__init__() + self.config = config + self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts) + self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts) + + def forward(self, permuted_tokens, tokens_per_expert): + """ + Forward pass of the Grouped MLP. + + Args: + permuted_tokens (torch.Tensor): Permuted input tokens. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor after passing through the MLP. + """ + fc1_output = self.fc1(permuted_tokens, tokens_per_expert) + projection, gate = torch.chunk(fc1_output, 2, dim=-1) + fc1_output = nn.functional.silu(projection) * gate + fc2_output = self.fc2(fc1_output, tokens_per_expert) + return fc2_output + + +# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587 +class AriaTextMoELayer(nn.Module): + """ + Aria Text Mixture of Experts (MoE) Layer. + + This layer applies a gating mechanism to route input tokens to different experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + + self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False) + self.experts = AriaGroupedExpertsMLP(config) + self.shared_experts = AriaSharedExpertsMLP(config) + self.config = config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (`torch.Tensor`): + Input tensor of shape (batch_size, sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + + Process: + 1. Route tokens to experts using the router. + 2. Permute tokens based on routing decisions. + 3. Process tokens through experts. + 4. Unpermute and combine expert outputs. + 5. Add shared expert output to the final result. 
+ """ + original_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # Top K Routing + logits = self.router(hidden_states) + top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1) + scores = nn.functional.softmax(top_logits, dim=-1) + + original_dtype = top_indices.dtype + + tokens_per_expert = torch.histc( + top_indices.flatten().to(torch.float32), + bins=self.config.moe_num_experts, + min=0, + max=self.config.moe_num_experts - 1, + ).to(original_dtype) + indices = top_indices + + # Token permutation + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices) + permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk) + + # Process through experts + expert_output = self.experts(permuted_tokens, tokens_per_expert) + + # Token unpermutation + unpermuted_tokens = torch.zeros( + (scores.shape[0] * self.config.moe_topk, expert_output.size(1)), + dtype=expert_output.dtype, + device=expert_output.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, expert_output) + unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1)) + + output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape) + + # Add shared expert output + shared_expert_output = self.shared_experts(hidden_states.view(original_shape)) + return output + shared_expert_output + + +class AriaTextRotaryEmbedding(nn.Module): + def __init__( + self, + dim=None, + max_position_embeddings=2048, + base=10000, + device=None, + scaling_factor=1.0, + rope_type="default", + config: Optional[AriaTextConfig] = None, + ): + super().__init__() + # TODO (joao): remove the `if` below, only used for BC + self.rope_kwargs = {} + if config is None: + logger.warning_once( + "`AriaTextRotaryEmbedding` can now be fully parameterized by passing the model config through the " + "`config` argument. 
All other arguments will be removed in v4.46" + ) + self.rope_kwargs = { + "rope_type": rope_type, + "factor": scaling_factor, + "dim": dim, + "base": base, + "max_position_embeddings": max_position_embeddings, + } + self.rope_type = rope_type + self.max_seq_len_cached = max_position_embeddings + self.original_max_seq_len = max_position_embeddings + else: + # BC: "rope_type" was originally "type" + if config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, **self.rope_kwargs) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + def _dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn( + self.config, device, seq_len=seq_len, **self.rope_kwargs + ) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @torch.no_grad() + def forward(self, x, position_ids): + if "dynamic" in self.rope_type: + self._dynamic_frequency_update(position_ids, device=x.device) + + # Core RoPE block + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 (see https://github.com/huggingface/transformers/pull/29285) + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + + # Advanced RoPE types (e.g. yarn) apply a post-processing scaling factor, equivalent to scaling attention + cos = cos * self.attention_scaling + sin = sin * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. 
+ unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class AriaTextAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: AriaTextConfig, layer_idx: Optional[int] = None): + super().__init__() + self.config = config + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." 
+ ) + + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads) + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_heads // self.num_key_value_heads + self.max_position_embeddings = config.max_position_embeddings + self.rope_theta = config.rope_theta + self.is_causal = True + + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias) + + # TODO (joao): remove in v4.46 (RoPE is computed in the model, not in the decoder layers) + self.rotary_emb = AriaTextRotaryEmbedding(config=self.config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." 
+ ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + + attn_output = attn_output.reshape(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class AriaTextFlashAttention2(AriaTextAttention): + """ + AriaText flash attention module. This module inherits from `AriaTextAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). 
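+        # Illustration: with q_seqlen=2 and k_seqlen=4, a bottom-right aligned causal mask lets the last query
+        # attend to every key,
+        #     [[1, 1, 1, 0],
+        #      [1, 1, 1, 1]]
+        # whereas a top-left aligned mask would stop at
+        #     [[1, 0, 0, 0],
+        #      [1, 1, 0, 0]]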
+ self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if isinstance(past_key_value, StaticCache): + raise ValueError( + "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` " + "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers" + ) + + output_attentions = False + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim + # therefore we just need to keep the original shape + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache + # to be able to avoid many of these transpose/reshape/view. + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) + value_states = value_states.transpose(1, 2) + + dropout_rate = self.attention_dropout if self.training else 0.0 + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. 
(AriaTextRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + position_ids=position_ids, + dropout=dropout_rate, + sliding_window=getattr(self, "sliding_window", None), + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + **kwargs, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1).contiguous() + attn_output = self.o_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class AriaTextSdpaAttention(AriaTextAttention): + """ + AriaText attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from + `AriaTextAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to + SDPA API. + """ + + # Adapted from AriaTextAttention.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: bool = False, + use_cache: bool = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + if output_attentions: + # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "AriaTextModel is using AriaTextSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, " + 'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' 
+ ) + return super().forward( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + # use -1 to infer num_heads and num_key_value_heads as they may vary if tensor parallel is used + query_states = query_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, -1, self.head_dim).transpose(1, 2) + + if position_embeddings is None: + logger.warning_once( + "The attention layers in this model are transitioning from computing the RoPE embeddings internally " + "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed " + "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be " + "removed and `position_embeddings` will be mandatory." + ) + cos, sin = self.rotary_emb(value_states, position_ids) + else: + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask, + # Reference: https://github.com/pytorch/pytorch/issues/112577. + if query_states.device.type == "cuda" and causal_mask is not None: + query_states = query_states.contiguous() + key_states = key_states.contiguous() + value_states = value_states.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + is_causal = True if causal_mask is None and q_len > 1 else False + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=causal_mask, + dropout_p=self.attention_dropout if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, -1) + + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value + + +ARIA_TEXT_ATTENTION_CLASSES = { + "eager": AriaTextAttention, + "flash_attention_2": AriaTextFlashAttention2, + "sdpa": AriaTextSdpaAttention, +} + + +class AriaTextDecoderLayer(nn.Module): + """ + Aria Text Decoder Layer. + + This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network. 
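+    The layer follows the pre-norm residual pattern used throughout this file: the input is normalized with
+    `AriaTextRMSNorm` before self-attention and added back residually, then normalized again before the MoE
+    feed-forward block (`AriaTextMoELayer`) and added back residually.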
+ + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + layer_idx (`int`): + Index of the layer. + """ + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = ARIA_TEXT_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) + self.mlp = AriaTextMoELayer(config) + self.input_layernorm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # will become mandatory in v4.46 + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): + attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1, + query_sequence_length, key_sequence_length)` if default attention is used. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class AriaTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. 
+ """ + + config_class = AriaConfig + base_model_prefix = "model" + _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaGroupedExpertsGemm): + module.weight.data.normal_(mean=0.0, std=std) + elif isinstance(module, nn.Conv2d): + module.weight.data.normal_(mean=0.0, std=std) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +ARIA_TEXT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AriaTextConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Aria Model outputting raw hidden-states without any specific head on top.", + ARIA_TEXT_START_DOCSTRING, +) +class AriaPreTrainedModel(PreTrainedModel): + config_class = AriaTextConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["AriaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaProjector): + nn.init.trunc_normal_(module.query, std=std) + + +ARIA_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. 
+ + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. 
+""" + + +@add_start_docstrings( + "The bare AriaText Model outputting raw hidden-states without any specific head on top.", + ARIA_TEXT_START_DOCSTRING, +) +class AriaTextModel(AriaTextPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AriaTextDecoderLayer`] + + Args: + config: AriaTextConfig + """ + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = AriaTextRotaryEmbedding(config=config) + self.gradient_checkpointing = False + if getattr(config, "pretraining_tp", 1) != 1: + logger.warn("`pretraining_tp` is deprecated, please use `model.tensor_parallel` instead.") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], + ) -> Union[Tuple, BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. 
Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + causal_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **flash_attn_kwargs, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache = layer_outputs[2 if output_attentions else 1] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. 
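+        # Rough intuition for the early return below: when the user-provided mask adds nothing beyond plain
+        # causality, returning None lets torch.nn.functional.scaled_dot_product_attention build the causal mask
+        # internally through `is_causal=True`, which keeps the fused/flash kernels eligible for dispatch.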
+ past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type == "cuda" + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to plcae the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. 
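+            # i.e. a float mask of shape (batch_size, 1, query_length, key_value_length) whose masked positions
+            # already hold a large negative value (e.g. torch.finfo(dtype).min), so it can be added directly to
+            # the attention scores.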
+ causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :] + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... + + +class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin): + """ + Aria model for causal language modeling tasks. + + This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach, + allowing for more efficient and scalable language modeling. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + config_class = AriaTextConfig + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.model = AriaTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + num_logits_to_keep: int = 0, + **kwargs: Unpack[KwargsForCausalLM], + ) -> Union[Tuple, CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + num_logits_to_keep (`int`, *optional*): + Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). 
Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, AriaTextForCausalLM + + >>> model = AriaTextForCausalLM.from_pretrained("meta-aria_text/AriaText-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-aria_text/AriaText-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@dataclass +class AriaCausalLMOutputWithPast(ModelOutput): + """ + Base class for Aria causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. 
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +ARIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor`, *optional*): + Input token IDs. + pixel_values (`torch.FloatTensor`, *optional*): + Pixel values of the images. + pixel_mask (`torch.LongTensor`, *optional*): + Mask for the pixel values. + attention_mask (`torch.Tensor`, *optional*): + Attention mask. + position_ids (`torch.LongTensor`, *optional*): + Position IDs. + past_key_values (`List[torch.FloatTensor]`, *optional*): + Past key values for efficient processing. + inputs_embeds (`torch.FloatTensor`, *optional*): + Input embeddings. + labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether to use the model's cache mechanism. + output_attentions (`bool`, *optional*): + Whether to output attention weights. + output_hidden_states (`bool`, *optional*): + Whether to output hidden states. + return_dict (`bool`, *optional*): + Whether to return a `ModelOutput` object. + num_logits_to_keep (`int`, *optional*, defaults to 0): + Calculate logits for the last `num_logits_to_keep` tokens, or all `input_ids` if `0`. + cache_position (`torch.LongTensor`, *optional*): + Cache positions. + **loss_kwargs: + Additional keyword arguments for loss calculation. +""" + +ARIA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`AriaConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + """Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language model + to perform tasks that involve both image and text inputs.""", + ARIA_START_DOCSTRING, +) +class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): + config_class = AriaConfig + _supports_flash_attn_2 = False + _supports_sdpa = False + + def __init__(self, config: AriaConfig): + super().__init__(config) + + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = AriaProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" + self.post_init() + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + pixel_mask: torch.FloatTensor = None, + vision_feature_layer: int = -1, + ): + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + image_outputs = self.vision_tower( + pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask) + return image_features + + @add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig) + def forward( + self, + input_ids: torch.LongTensor = None, + pixel_values: torch.FloatTensor = None, + pixel_mask: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + num_logits_to_keep: int = 0, + cache_position: Optional[torch.LongTensor] = None, + **loss_kwargs, + ) -> 
Union[Tuple, AriaCausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). + Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., config.vocab_size]`. + Returns: + + Example: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from transformers import AutoProcessor, AutoModel + >>> from transformers.image_utils import load_image + + >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible + >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg") + >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg") + + >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria") + >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", torch_dtype=torch.bfloat16, device_map="auto") + + >>> # Create inputs + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."}, + ... {"type": "image"}, + ... {"type": "text", "text": "What can we see in this image?"}, + ... ] + ... }, + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In which city is that bridge located?"}, + ... ] + ... } + ... ] + + >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages] + >>> images = [[image1, image2], [image3]] + >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts[0]) + Assistant: There are buildings, trees, lights, and water visible in this image. + + >>> print(generated_texts[1]) + Assistant: The bridge is in San Francisco. + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. 
Merge text and images + if pixel_values is not None and inputs_embeds.shape[1] != 1: + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device) + ) + n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0] + else: + image_embeds = input_ids == self.config.image_token_index + special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0) + image_features = self.get_image_features( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + vision_feature_layer=self.config.vision_feature_layer, + ) + n_images, n_features_per_image = image_features.shape[0], image_features.shape[1] + n_image_features = n_images * n_features_per_image + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return AriaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_mask=None, + attention_mask=None, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + num_logits_to_keep=num_logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_mask"] = pixel_mask + + return model_inputs + + +__all__ = [ + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextPreTrainedModel", + "AriaTextModel", + "AriaTextForCausalLM", +] diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py new file mode 100644 index 00000000000000..78c6e08bdfd0e5 --- /dev/null +++ b/src/transformers/models/aria/modular_aria.py @@ -0,0 +1,1598 @@ +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...activations import ACT2FN +from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin +from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...modeling_utils import PreTrainedModel +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + TensorType, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_available +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer +from ..llama.configuration_llama import LlamaConfig +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, +) +from ..llava.modeling_llava import LlavaCausalLMOutputWithPast +from ..llava_next.image_processing_llava_next import divide_to_patches, make_batched_images + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + from torch import nn + + +def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): + """ + Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts. + + Args: + token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features). + expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features). + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + num_tokens = token_states.shape[0] + out_features = expert_weights.shape[-1] + output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the begining for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + + for expert_num in range(expert_weights.shape[0]): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + tokens = token_states[start:end] + + out = torch.matmul(tokens, expert_weights[expert_num]) + output[start:end] = out + return output + + +class AriaTextConfig(LlamaConfig): + r""" + This class handles the configuration for the text component of the Aria model. 
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 4096): + The size of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 2): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. 
NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + moe_num_experts (`int`, *optional*, defaults to 8): + The number of experts in the MoE layer. + moe_topk (`int`, *optional*, defaults to 2): + The number of top experts to route to for each token. + moe_num_shared_experts (`int`, *optional*, defaults to 2): + The number of shared experts. 
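A hedged example of how these MoE arguments compose with the inherited LlamaConfig arguments (the values are made up for illustration, and it assumes `AriaTextConfig` is importable from `transformers`):

```python
from transformers import AriaTextConfig

text_config = AriaTextConfig(
    hidden_size=2560,
    num_hidden_layers=28,
    intermediate_size=1664,      # per-expert FFN width
    moe_num_experts=64,          # routed experts
    moe_topk=6,                  # experts activated per token by the router
    moe_num_shared_experts=2,    # shared experts; their MLP width is intermediate_size * moe_num_shared_experts
)
print(text_config.moe_topk, text_config.intermediate_size)  # 6 1664
```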
+ """ + + model_type = "aria_text" + base_config_key = "text_config" + + def __init__( + self, + intermediate_size: int = 4096, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + pad_token_id=2, + **super_kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **super_kwargs) + self.intermediate_size = intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + r""" + This class handles the configuration for both vision and text components of the Aria model, + as well as additional parameters for image token handling and projector mapping. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`AriaVisionConfig` or `dict`, *optional*): + Configuration for the vision component. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + text_config (`AriaTextConfig` or `dict`, *optional*): + Configuration for the text component. + projector_patch_to_query_dict (`dict`, *optional*): + Mapping of patch sizes to query dimensions. + image_token_index (`int`, *optional*, defaults to 9): + Index used to represent image tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + + Attributes: + model_type (`str`): + Type of the model, set to `"aria"`. + image_token_index (`int`): + Index used to represent image tokens. + projector_patch_to_query_dict (`dict`): + Mapping of patch sizes to query dimensions. + vision_config (`AriaVisionConfig`): + Configuration for the vision component. + text_config (`AriaTextConfig`): + Configuration for the text component. 
+ """ + + model_type = "aria" + sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + vision_feature_layer: int = -1, + text_config: AriaTextConfig = None, + projector_patch_to_query_dict: Dict = None, + image_token_index: int = 9, + initializer_range: float = 0.02, + **kwargs, + ): + self.image_token_index = image_token_index + + # Convert the keys and values of projector_patch_to_query_dict to integers + # This ensures consistency even if they were provided as strings + if projector_patch_to_query_dict is None: + projector_patch_to_query_dict = { + 1225: 128, + 4900: 256, + } + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) + self.vision_feature_layer = vision_feature_layer + if isinstance(vision_config, dict): + vision_config["model_type"] = "idefics3_vision" + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["idefics3_vision"]() + + self.vision_config = vision_config + self.initializer_range = initializer_range + + if isinstance(text_config, dict) and "model_type" in text_config: + text_config = AriaTextConfig(**text_config) + elif text_config is None: + text_config = AriaTextConfig() + + self.text_config = text_config + + super().__init__(**kwargs) + + +class AriaTextRMSNorm(LlamaRMSNorm): + pass + + +class AriaProjectorMLP(nn.Module): + """ + Feed-Forward Network module for the Aria Projector. + + Args: + in_features (`int`): + Input embedding dimension. + hidden_features (`int`): + Hidden dimension of the feed-forward network. + output_dim (`int`): + Output dimension. + """ + + def __init__(self, in_features, hidden_features, output_dim): + super().__init__() + self.linear_in = nn.Linear(in_features, hidden_features, bias=False) + self.linear_out = nn.Linear(hidden_features, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class AriaCrossAttention(nn.Module): + """ + Aria Cross-Attention module. + + Args: + config (`AriaConfig`): + The configuration to use. + """ + + def __init__(self, config: AriaConfig, dropout_rate: float = 0): + super().__init__() + hidden_size = config.vision_config.hidden_size + num_heads = config.vision_config.num_attention_heads + self.num_heads = num_heads + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False) + + # Original code here: https://github.com/rhymes-ai/Aria/blob/719ff4e52b727443cba3793b0e27fe64e0244fe1/aria/model/projector.py#L48 + self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) + self.linear = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(dropout_rate) + + self.layer_norm = nn.LayerNorm(hidden_size) + self.layer_norm_kv = nn.LayerNorm(hidden_size) + + def forward(self, key_value_states, hidden_states, attn_mask=None): + """ + Forward pass of the AriaCrossAttention module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor for key and value. + hidden_states (`torch.Tensor`): + Input tensor for query. 
+ attn_mask (`torch.Tensor`, *optional*, defaults to None): + Attention mask. + + Returns: + torch.Tensor: + Output tensor after cross-attention. + """ + query = self.q_proj(self.layer_norm(hidden_states)) + + key_value_states = self.layer_norm_kv(key_value_states) + key = self.k_proj(key_value_states) + value = self.v_proj(key_value_states) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + Aria Projector module. + + This module projects vision features into the language model's embedding space, enabling interaction between vision and language components. + + Args: + config (`AriaConfig`): + Configuration object for the model. + """ + + def __init__( + self, + config: AriaConfig, + ): + super().__init__() + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size + + self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features)) + + self.cross_attn = AriaCrossAttention(config) + + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim) + + def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + """ + Forward pass of the Projector module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (`torch.Tensor`, *optional*, default is None): + Attention mask. + + Returns: + `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim). + """ + batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1] + + if num_patches not in self.patch_to_query_dict.keys(): + raise KeyError( + f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}." + ) + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask) + + out = self.feed_forward(self.layer_norm(attention_out)) + + return out + + +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class AriaImageProcessor(BaseImageProcessor): + """ + A vision processor for the Aria model that handles image preprocessing. + Initialize the AriaImageProcessor. + + Args: + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. 
+ image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to 980): + Maximum image size. + min_image_size (`int`, *optional*, defaults to 336): + Minimum image size. + split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples): + The optimal resolutions for splitting the image. + split_image (`bool`, *optional*, defaults to `False`): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `BICUBIC`): + The resampling filter to use if resizing the image. + """ + + def __init__( + self, + image_mean: List[float] = None, + image_std: List[float] = None, + max_image_size: int = 980, + min_image_size: int = 336, + split_resolutions: Optional[List[Tuple[int, int]]] = None, + split_image: Optional[bool] = False, + do_convert_rgb: Optional[bool] = True, + do_normalize: Optional[bool] = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ): + super().__init__(**kwargs) + + if image_mean is None: + image_mean = [0.5, 0.5, 0.5] + if image_std is None: + image_std = [0.5, 0.5, 0.5] + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.image_mean = image_mean + self.image_std = image_std + self.split_image = split_image + if split_resolutions is None: + split_resolutions = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (2, 4), (2, 3), (2, 2), (2, 1), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)] # fmt: skip + split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] + self.split_resolutions = split_resolutions + self.do_convert_rgb = do_convert_rgb + self.do_normalize = do_normalize + self.resample = resample + + def preprocess( + self, + images: Union[ImageInput, List[ImageInput]], + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + max_image_size: Optional[int] = None, + min_image_size: Optional[int] = None, + split_image: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + do_normalize: Optional[bool] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Process a list of images. + + Args: + images (ImageInput or list of ImageInput): + The input image or a list of images. + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)): + Maximum image size. + min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)): + Minimum image size. + split_image (`bool`, *optional*, defaults to `self.split_image` (False)): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): + Whether to convert the image to RGB. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): + Whether to normalize the image. 
+ resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): + The resampling filter to use if resizing the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"): + The type of tensor to return. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + BatchFeature: + A BatchFeature object containing: + - 'pixel_values': + Tensor of processed image pixel values. + - 'pixel_mask': + Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where: + - True (1) values indicate pixels that belong to the original resized image. + - False (0) values indicate pixels that are part of the padding. + The mask helps distinguish between actual image content and padded areas in subsequent processing steps. + - 'num_crops': + The maximum number of crops across all images. + """ + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + max_image_size = max_image_size if max_image_size is not None else self.max_image_size + min_image_size = min_image_size if min_image_size is not None else self.min_image_size + split_image = split_image if split_image is not None else self.split_image + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if max_image_size not in [490, 980]: + raise ValueError("max_image_size must be either 490 or 980") + + images = make_batched_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + resample=resample, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images[0]) + + pixel_values = [] + pixel_masks = [] + num_crops = None + + for image in images: + if split_image: + crop_images = self.get_image_patches( + image, + self.split_resolutions, + max_image_size, + resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + else: + crop_images = [image] + if num_crops is None or len(crop_images) > num_crops: + num_crops = len(crop_images) + + for crop_image in crop_images: + # At this point the scale is the rescaling factor that would bring the image to max_size in its larger dimension + h, w = get_image_size(crop_image) + scale = max_image_size / max(h, w) + if w >= h: + new_size = (max(int(h * scale), min_image_size), max_image_size) # h, w + else: + new_size = (max_image_size, max(int(w * scale), min_image_size)) # h, w + + crop_image_resized = resize( + crop_image, + new_size, + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + padding_bottom, padding_right = max_image_size - new_size[0], max_image_size - new_size[1] + crop_image_padded = pad( + crop_image_resized, + ((0, padding_bottom), (0, padding_right)), + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # Create a pixel mask + pixel_mask = np.zeros((max_image_size, max_image_size), dtype=bool) + pixel_mask[: new_size[0], : new_size[1]] = 1 + pixel_masks.append(pixel_mask) + + if do_normalize: + crop_image_padded = self.normalize( + crop_image_padded / 255.0, + self.image_mean, + self.image_std, + data_format=input_data_format, + input_data_format=input_data_format, + ) + crop_image_padded = ( + to_channel_dimension_format(crop_image_padded, data_format, input_data_format) + if data_format is not None + else crop_image_padded + ) + + pixel_values.append(crop_image_padded) + return BatchFeature( + data={ + "pixel_values": np.stack(pixel_values, axis=0), + "pixel_mask": np.stack(pixel_masks, axis=0), + "num_crops": num_crops, + }, + tensor_type=return_tensors, + ) + + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. 
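A small worked example of the aspect-preserving resize followed by centered padding (mirroring `_get_patch_output_size` and the body of this method); the target resolution `(490, 980)` corresponds to the `(1, 2)` entry of the default `split_resolutions` scaled by 490:

```python
import math

def patch_output_size(orig_h, orig_w, target_h, target_w):
    # Same arithmetic as `_get_patch_output_size`: scale so the image fits inside
    # the target resolution while keeping its aspect ratio.
    scale_w, scale_h = target_w / orig_w, target_h / orig_h
    if scale_w < scale_h:
        return min(math.ceil(orig_h * scale_w), target_h), target_w
    return target_h, min(math.ceil(orig_w * scale_h), target_w)

orig_h, orig_w = 1080, 1920          # a landscape photo
target_h, target_w = 490, 980        # split resolution (1, 2) * 490

new_h, new_w = patch_output_size(orig_h, orig_w, target_h, target_w)   # (490, 872)
paste_y = (target_h - new_h) // 2    # 0  -> no vertical padding
paste_x = (target_w - new_w) // 2    # 54 -> 54 px of padding on each side
print(new_h, new_w, paste_y, paste_x)
```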
+ """ + target_height, target_width = target_resolution + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + + return padded_image + + def pad( + self, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`) + dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected + as input. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. 
+ + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + padding_mode_mapping = { + PaddingMode.CONSTANT: "constant", + PaddingMode.REFLECT: "reflect", + PaddingMode.REPLICATE: "edge", + PaddingMode.SYMMETRIC: "symmetric", + } + image = np.pad(image, padding, mode=padding_mode_mapping[mode], constant_values=constant_values) + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def get_image_patches( + self, + image: np.array, + grid_pinpoints: List[Tuple[int, int]], + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (`np.array`): + The input image to be processed. + grid_pinpoints (List[Tuple[int, int]]): + A list of possible resolutions as tuples. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + `List[np.array]`: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + return patches + + +class AriaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "max_image_size": 980, + "split_image": False, + }, + "return_tensors": TensorType.PYTORCH, + } + + +class AriaProcessor(ProcessorMixin): + """ + AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. + + Args: + image_processor (`AriaImageProcessor`, *optional*): + The AriaImageProcessor to use for image preprocessing. + tokenizer (`PreTrainedTokenizerBase`, *optional*): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. 
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "size_conversion"] + image_processor_class = "AriaImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer: Union[AutoTokenizer, str] = None, + chat_template: Optional[str] = None, + size_conversion: Optional[Dict[Union[float, int], int]] = None, + ): + if size_conversion is None: + size_conversion = {490: 128, 980: 256} + self.size_conversion = {int(k): v for k, v in size_conversion.items()} + + if tokenizer is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[AriaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + AriaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if images is not None: + image_inputs = self.image_processor( + images, + **output_kwargs["images_kwargs"], + ) + # expand the image_token according to the num_crops and tokens per image + tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]] + prompt_strings = [] + num_crops = image_inputs.pop("num_crops") * tokens_per_image + for sample in text: + sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops) + prompt_strings.append(sample) + + else: + image_inputs = {} + prompt_strings = text + + text_inputs = self.tokenizer( + prompt_strings, + **output_kwargs["text_kwargs"], + ) + + return BatchFeature(data={**text_inputs, **image_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class AriaSharedExpertsMLP(LlamaMLP): + """ + Shared Expert MLP for shared experts. + + Unlike routed experts, shared experts process all tokens without routing. + This class reconfigures the intermediate size in comparison to the LlamaMLP. + + Args: + config (`AriaTextConfig`): Configuration object for the Aria language model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__(self) + self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts + + +class AriaGroupedExpertsGemm(nn.Module): + """ + Grouped GEMM (General Matrix Multiplication) module for efficient expert computation. + This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm) + for optimized performance. If the grouped_gemm library is not installed, it gracefully + falls back to a sequential GEMM implementation, which may be slower but ensures + functionality. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + groups (`int`): + Number of expert groups. + """ + + def __init__(self, in_features, out_features, groups): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.groups = groups + self.weight = nn.Parameter(torch.empty(groups, in_features, out_features)) + + def forward(self, input, tokens_per_expert): + """ + Perform grouped matrix multiplication. + + Args: + input (`torch.Tensor`): + Input tensor of shape (num_tokens, in_features). + tokens_per_expert (`torch.Tensor`): + Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + return sequential_experts_gemm( + input, + self.weight, + tokens_per_expert.cpu(), + ) + + +class AriaGroupedExpertsMLP(nn.Module): + """ + Grouped MLP module for Mixture of Experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + def __init__(self, config: AriaTextConfig) -> None: + super().__init__() + self.config = config + self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts) + self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts) + + def forward(self, permuted_tokens, tokens_per_expert): + """ + Forward pass of the Grouped MLP. + + Args: + permuted_tokens (torch.Tensor): Permuted input tokens. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor after passing through the MLP. 
+ """ + fc1_output = self.fc1(permuted_tokens, tokens_per_expert) + projection, gate = torch.chunk(fc1_output, 2, dim=-1) + fc1_output = nn.functional.silu(projection) * gate + fc2_output = self.fc2(fc1_output, tokens_per_expert) + return fc2_output + + +# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587 +class AriaTextMoELayer(nn.Module): + """ + Aria Text Mixture of Experts (MoE) Layer. + + This layer applies a gating mechanism to route input tokens to different experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + + self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False) + self.experts = AriaGroupedExpertsMLP(config) + self.shared_experts = AriaSharedExpertsMLP(config) + self.config = config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (`torch.Tensor`): + Input tensor of shape (batch_size, sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + + Process: + 1. Route tokens to experts using the router. + 2. Permute tokens based on routing decisions. + 3. Process tokens through experts. + 4. Unpermute and combine expert outputs. + 5. Add shared expert output to the final result. + """ + original_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # Top K Routing + logits = self.router(hidden_states) + top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1) + scores = nn.functional.softmax(top_logits, dim=-1) + + original_dtype = top_indices.dtype + + tokens_per_expert = torch.histc( + top_indices.flatten().to(torch.float32), + bins=self.config.moe_num_experts, + min=0, + max=self.config.moe_num_experts - 1, + ).to(original_dtype) + indices = top_indices + + # Token permutation + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices) + permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk) + + # Process through experts + expert_output = self.experts(permuted_tokens, tokens_per_expert) + + # Token unpermutation + unpermuted_tokens = torch.zeros( + (scores.shape[0] * self.config.moe_topk, expert_output.size(1)), + dtype=expert_output.dtype, + device=expert_output.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, expert_output) + unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1)) + + output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape) + + # Add shared expert output + shared_expert_output = self.shared_experts(hidden_states.view(original_shape)) + return output + shared_expert_output + + +class AriaTextDecoderLayer(LlamaDecoderLayer): + """ + Aria Text Decoder Layer. + + This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + layer_idx (`int`): + Index of the layer. 
+ """ + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__(self) + self.mlp = AriaTextMoELayer(config) + + +class AriaTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + """ + + config_class = AriaConfig + base_model_prefix = "model" + _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaGroupedExpertsGemm): + module.weight.data.normal_(mean=0.0, std=std) + elif isinstance(module, nn.Conv2d): + module.weight.data.normal_(mean=0.0, std=std) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + +class AriaPreTrainedModel(LlamaPreTrainedModel): + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaProjector): + nn.init.trunc_normal_(module.query, std=std) + + +class AriaTextModel(LlamaModel): + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + self.post_init() + + +class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM): + """ + Aria model for causal language modeling tasks. + + This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach, + allowing for more efficient and scalable language modeling. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + _tied_weights_keys = ["lm_head.weight"] + config_class = AriaTextConfig + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.model = AriaTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + +class AriaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +ARIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor`, *optional*): + Input token IDs. + pixel_values (`torch.FloatTensor`, *optional*): + Pixel values of the images. + pixel_mask (`torch.LongTensor`, *optional*): + Mask for the pixel values. + attention_mask (`torch.Tensor`, *optional*): + Attention mask. + position_ids (`torch.LongTensor`, *optional*): + Position IDs. + past_key_values (`List[torch.FloatTensor]`, *optional*): + Past key values for efficient processing. + inputs_embeds (`torch.FloatTensor`, *optional*): + Input embeddings. 
+ labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether to use the model's cache mechanism. + output_attentions (`bool`, *optional*): + Whether to output attention weights. + output_hidden_states (`bool`, *optional*): + Whether to output hidden states. + return_dict (`bool`, *optional*): + Whether to return a `ModelOutput` object. + num_logits_to_keep (`int`, *optional*, defaults to 0): + Calculate logits for the last `num_logits_to_keep` tokens, or all `input_ids` if `0`. + cache_position (`torch.LongTensor`, *optional*): + Cache positions. + **loss_kwargs: + Additional keyword arguments for loss calculation. +""" + +ARIA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`AriaConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + """Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language model + to perform tasks that involve both image and text inputs.""", + ARIA_START_DOCSTRING, +) +class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): + config_class = AriaConfig + _supports_flash_attn_2 = False + _supports_sdpa = False + + def __init__(self, config: AriaConfig): + super().__init__(config) + + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = AriaProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" + self.post_init() + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def 
get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: torch.FloatTensor = None,
+        vision_feature_layer: int = -1,
+    ):
+        patch_attention_mask = self._create_patch_attention_mask(pixel_mask)
+        image_outputs = self.vision_tower(
+            pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True
+        )
+        image_attn_mask = None
+        if patch_attention_mask is not None:
+            flattened_mask = patch_attention_mask.flatten(1)
+            image_attn_mask = torch.logical_not(flattened_mask)
+
+        selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+        image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask)
+        return image_features
+
+    @add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        pixel_mask: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        num_logits_to_keep: int = 0,
+        cache_position: Optional[torch.LongTensor] = None,
+        **loss_kwargs,
+    ) -> Union[Tuple, AriaCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `AriaForConditionalGeneration`).
+                Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only
+                computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> from transformers.image_utils import load_image
+
+        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
+        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
+        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
+
+        >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria")
+        >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", torch_dtype=torch.bfloat16, device_map="auto")
+
+        >>> # Create inputs
+        >>> messages = [
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "What can we see in this image?"},
+        ...         ]
+        ...     },
+        ...     {
+        ...         "role": "user",
+        ...         "content": [
+        ...             {"type": "image"},
+        ...             {"type": "text", "text": "In which city is that bridge located?"},
+        ...         ]
+        ...     }
+        ... 
] + + >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages] + >>> images = [[image1, image2], [image3]] + >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts[0]) + Assistant: There are buildings, trees, lights, and water visible in this image. + + >>> print(generated_texts[1]) + Assistant: The bridge is in San Francisco. + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. Merge text and images + if pixel_values is not None and inputs_embeds.shape[1] != 1: + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_index, dtype=torch.long, device=inputs_embeds.device) + ) + n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0] + else: + image_embeds = input_ids == self.config.image_token_index + special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0) + image_features = self.get_image_features( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + vision_feature_layer=self.config.vision_feature_layer, + ) + n_images, n_features_per_image = image_features.shape[0], image_features.shape[1] + n_image_features = n_images * n_features_per_image + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + num_logits_to_keep=num_logits_to_keep, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return AriaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_mask=None, + attention_mask=None, + cache_position=None, + num_logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + 
num_logits_to_keep=num_logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_mask"] = pixel_mask + + return model_inputs + + +__all__ = [ + "AriaConfig", + "AriaTextConfig", + "AriaImageProcessor", + "AriaProcessor", + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextPreTrainedModel", + "AriaTextModel", + "AriaTextForCausalLM", +] diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py new file mode 100644 index 00000000000000..2cfbd72a002061 --- /dev/null +++ b/src/transformers/models/aria/processing_aria.py @@ -0,0 +1,164 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils import PreTokenizedInput, TextInput +from ...utils import TensorType +from ..auto import AutoTokenizer + + +class AriaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "max_image_size": 980, + "split_image": False, + }, + "return_tensors": TensorType.PYTORCH, + } + + +class AriaProcessor(ProcessorMixin): + """ + AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. + + Args: + image_processor (`AriaImageProcessor`, *optional*): + The AriaImageProcessor to use for image preprocessing. + tokenizer (`PreTrainedTokenizerBase`, *optional*): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. 
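+
+    Example (an illustrative sketch; the image placeholder string used below is an assumption, the actual
+    placeholder is available as `processor.tokenizer.image_token`):
+
+    ```python
+    >>> import requests
+    >>> from PIL import Image
+
+    >>> from transformers import AriaProcessor
+
+    >>> processor = AriaProcessor.from_pretrained("rhymes-ai/Aria")
+    >>> image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+    >>> # "<|img|>" stands in for the image placeholder token; the processor expands it according to the
+    >>> # number of crops and the number of tokens per image
+    >>> inputs = processor(text="<|img|>\nDescribe the image.", images=image, return_tensors="pt")
+    >>> # `inputs` is a `BatchFeature` holding `input_ids`, `attention_mask`, `pixel_values` and `pixel_mask`
+    ```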
+ """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "size_conversion"] + image_processor_class = "AriaImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer: Union[AutoTokenizer, str] = None, + chat_template: Optional[str] = None, + size_conversion: Optional[Dict[Union[float, int], int]] = None, + ): + if size_conversion is None: + size_conversion = {490: 128, 980: 256} + self.size_conversion = {int(k): v for k, v in size_conversion.items()} + + if tokenizer is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[AriaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + AriaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if images is not None: + image_inputs = self.image_processor( + images, + **output_kwargs["images_kwargs"], + ) + # expand the image_token according to the num_crops and tokens per image + tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]] + prompt_strings = [] + num_crops = image_inputs.pop("num_crops") * tokens_per_image + for sample in text: + sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops) + prompt_strings.append(sample) + + else: + image_inputs = {} + prompt_strings = text + + text_inputs = self.tokenizer( + prompt_strings, + **output_kwargs["text_kwargs"], + ) + + return BatchFeature(data={**text_inputs, **image_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. 
+ """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AriaProcessor"] diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index c1f2d689df7095..cc3a7d5baaeb49 100644 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -35,6 +35,8 @@ ("albert", "AlbertConfig"), ("align", "AlignConfig"), ("altclip", "AltCLIPConfig"), + ("aria", "AriaConfig"), + ("aria_text", "AriaTextConfig"), ("audio-spectrogram-transformer", "ASTConfig"), ("autoformer", "AutoformerConfig"), ("bark", "BarkConfig"), @@ -135,6 +137,7 @@ ("idefics", "IdeficsConfig"), ("idefics2", "Idefics2Config"), ("idefics3", "Idefics3Config"), + ("idefics3_vision", "Idefics3VisionConfig"), ("ijepa", "IJepaConfig"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), @@ -327,6 +330,8 @@ ("albert", "ALBERT"), ("align", "ALIGN"), ("altclip", "AltCLIP"), + ("aria", "Aria"), + ("aria_text", "AriaText"), ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"), ("autoformer", "Autoformer"), ("bark", "Bark"), @@ -441,6 +446,7 @@ ("idefics", "IDEFICS"), ("idefics2", "Idefics2"), ("idefics3", "Idefics3"), + ("idefics3_vision", "Idefics3VisionTransformer"), ("ijepa", "I-JEPA"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), @@ -687,6 +693,8 @@ ("clip_vision_model", "clip"), ("qwen2_audio_encoder", "qwen2_audio"), ("clip_text_model", "clip"), + ("aria_text", "aria"), + ("idefics3_vision", "idefics3"), ("siglip_vision_model", "siglip"), ("chinese_clip_vision_model", "chinese_clip"), ("rt_detr_resnet", "rt_detr"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index e19c8efd205552..a699314f858928 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -54,6 +54,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( [ ("align", ("EfficientNetImageProcessor",)), + ("aria", ("AriaImageProcessor")), ("beit", ("BeitImageProcessor",)), ("bit", ("BitImageProcessor",)), ("blip", ("BlipImageProcessor",)), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 7a7cd9d475884c..e8e4814e6a0f6a 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -35,6 +35,8 @@ ("albert", "AlbertModel"), ("align", "AlignModel"), ("altclip", "AltCLIPModel"), + ("aria", "AriaForConditionalGeneration"), + ("aria_text", "AriaTextModel"), ("audio-spectrogram-transformer", "ASTModel"), ("autoformer", "AutoformerModel"), ("bark", "BarkModel"), @@ -132,6 +134,7 @@ ("idefics", "IdeficsModel"), ("idefics2", "Idefics2Model"), ("idefics3", "Idefics3Model"), + ("idefics3_vision", "Idefics3VisionTransformer"), ("ijepa", "IJepaModel"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), @@ -464,6 +467,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 
= OrderedDict( [ # Model for Causal LM mapping + ("aria_text", "AriaTextForCausalLM"), ("bart", "BartForCausalLM"), ("bert", "BertLMHeadModel"), ("bert-generation", "BertGenerationDecoder"), @@ -768,6 +772,7 @@ MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( [ + ("aria", "AriaForConditionalGeneration"), ("blip", "BlipForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"), ("chameleon", "ChameleonForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index c1f23bc1cb3f18..3e475b1be211fa 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -47,6 +47,7 @@ [ ("align", "AlignProcessor"), ("altclip", "AltCLIPProcessor"), + ("aria", "AriaProcessor"), ("bark", "BarkProcessor"), ("blip", "BlipProcessor"), ("blip-2", "Blip2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index e246bf3094c9cb..3cc181ac87adc4 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -68,6 +68,7 @@ ), ), ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("bart", ("BartTokenizer", "BartTokenizerFast")), ( diff --git a/src/transformers/models/idefics3/__init__.py b/src/transformers/models/idefics3/__init__.py index 35b1df5c678439..cec07ca6f5e2d3 100644 --- a/src/transformers/models/idefics3/__init__.py +++ b/src/transformers/models/idefics3/__init__.py @@ -16,7 +16,7 @@ from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available -_import_structure = {"configuration_idefics3": ["Idefics3Config"]} +_import_structure = {"configuration_idefics3": ["Idefics3Config", "Idefics3VisionConfig"]} try: @@ -38,11 +38,12 @@ "Idefics3ForConditionalGeneration", "Idefics3PreTrainedModel", "Idefics3Model", + "Idefics3VisionTransformer", ] _import_structure["processing_idefics3"] = ["Idefics3Processor"] if TYPE_CHECKING: - from .configuration_idefics3 import Idefics3Config + from .configuration_idefics3 import Idefics3Config, Idefics3VisionConfig try: if not is_vision_available(): @@ -62,6 +63,7 @@ Idefics3ForConditionalGeneration, Idefics3Model, Idefics3PreTrainedModel, + Idefics3VisionTransformer, ) from .processing_idefics3 import Idefics3Processor diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index d770b83df935a5..8747939a62ce2a 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -685,6 +685,41 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class AriaForConditionalGeneration(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaTextForCausalLM(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class AriaTextModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + 
requires_backends(self, ["torch"]) + + +class AriaTextPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class ASTForAudioClassification(metaclass=DummyObject): _backends = ["torch"] @@ -4978,6 +5013,20 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class Idefics3VisionConfig(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class Idefics3VisionTransformer(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class IJepaForImageClassification(metaclass=DummyObject): _backends = ["torch"] diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d2ccaeaaed23a8..3ebda4404aae9c 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -23,6 +23,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["vision"]) +class AriaImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class BeitFeatureExtractor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 12faeb8da9256a..76ab793e3a36c0 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1727,6 +1727,7 @@ def test_generate_from_inputs_embeds_with_static_cache(self): num_hidden_layers = text_config.num_hidden_layers inputs_embeds = model.get_input_embeddings()(input_ids) + max_cache_len += inputs_embeds.shape[1] outputs = model.generate(inputs_embeds=inputs_embeds, **generation_kwargs, **inputs_dict) # we should get `max_length` in shape, not `max_length - embeds_length` diff --git a/tests/models/aria/__init__.py b/tests/models/aria/__init__.py new file mode 100644 index 00000000000000..e69de29bb2d1d6 diff --git a/tests/models/aria/test_image_processing_aria.py b/tests/models/aria/test_image_processing_aria.py new file mode 100644 index 00000000000000..8a0f84d34eefed --- /dev/null +++ b/tests/models/aria/test_image_processing_aria.py @@ -0,0 +1,268 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import unittest + +import numpy as np + +from transformers.image_utils import PILImageResampling +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import ImageProcessingTestMixin + + +if is_vision_available(): + from PIL import Image + + from transformers import AriaImageProcessor + + +if is_torch_available(): + import torch + + +class AriaImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + num_images=1, + min_resolution=30, + max_resolution=40, + size=None, + max_image_size=980, + min_image_size=336, + split_resolutions=None, + split_image=True, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_convert_rgb=True, + resample=PILImageResampling.BICUBIC, + ): + super().__init__() + self.size = size if size is not None else {"longest_edge": max_resolution} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.num_images = num_images + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.resample = resample + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.split_resolutions = split_resolutions if split_resolutions is not None else [[980, 980]] + self.split_image = split_image + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_convert_rgb = do_convert_rgb + + def prepare_image_processor_dict(self): + return { + "image_mean": self.image_mean, + "image_std": self.image_std, + "max_image_size": self.max_image_size, + "min_image_size": self.min_image_size, + "split_resolutions": self.split_resolutions, + "split_image": self.split_image, + "do_convert_rgb": self.do_convert_rgb, + "do_normalize": self.do_normalize, + "resample": self.resample, + } + + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to AriaImageProcessor, + assuming do_resize is set to True. The expected size in that case the max image size. + """ + return self.max_image_size, self.max_image_size + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def prepare_image_inputs( + self, + batch_size=None, + min_resolution=None, + max_resolution=None, + num_channels=None, + num_images=None, + size_divisor=None, + equal_resolution=False, + numpify=False, + torchify=False, + ): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + + One can specify whether the images are of the same resolution or not. 
+ """ + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + batch_size = batch_size if batch_size is not None else self.batch_size + min_resolution = min_resolution if min_resolution is not None else self.min_resolution + max_resolution = max_resolution if max_resolution is not None else self.max_resolution + num_channels = num_channels if num_channels is not None else self.num_channels + num_images = num_images if num_images is not None else self.num_images + + images_list = [] + for i in range(batch_size): + images = [] + for j in range(num_images): + if equal_resolution: + width = height = max_resolution + else: + # To avoid getting image width/height 0 + if size_divisor is not None: + # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` + min_resolution = max(size_divisor, min_resolution) + width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) + images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) + images_list.append(images) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list] + + if torchify: + images_list = [[torch.from_numpy(image) for image in images] for images in images_list] + + if numpify: + # Numpy images are typically in channels last format + images_list = [[image.transpose(1, 2, 0) for image in images] for images in images_list] + + return images_list + + +@require_torch +@require_vision +class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = AriaImageProcessor if is_vision_available() else None + + def setUp(self): + super().setUp() + self.image_processor_tester = AriaImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "max_image_size")) + self.assertTrue(hasattr(image_processing, "min_image_size")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "split_image")) + + def test_call_numpy(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + 
tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_numpy_4_channels(self): + # Aria always processes images as RGB, so it always returns images with 3 channels + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processor_dict = self.image_processor_dict + image_processing = self.image_processing_class(**image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pil(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for images in image_inputs: + for image in images: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pytorch(self): + for image_processing_class in self.image_processor_list: + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + for images in image_inputs: + for image in images: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + self.assertEqual( + tuple(encoded_images.shape), + (self.image_processor_tester.batch_size, *expected_output_image_shape), + ) diff --git a/tests/models/aria/test_modeling_aria.py 
b/tests/models/aria/test_modeling_aria.py new file mode 100644 index 00000000000000..d3458530ac349e --- /dev/null +++ b/tests/models/aria/test_modeling_aria.py @@ -0,0 +1,669 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Testing suite for the PyTorch Aria model.""" + +import gc +import unittest + +import requests + +from transformers import ( + AriaConfig, + AriaForConditionalGeneration, + AriaTextConfig, + AutoProcessor, + AutoTokenizer, + is_torch_available, + is_vision_available, +) +from transformers.models.idefics3 import Idefics3VisionConfig +from transformers.testing_utils import ( + require_bitsandbytes, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor + + +if is_torch_available(): + import torch +else: + is_torch_greater_or_equal_than_2_0 = False + +if is_vision_available(): + from PIL import Image + + +class AriaVisionText2TextModelTester: + def __init__( + self, + parent, + ignore_index=-100, + image_token_index=9, + projector_hidden_act="gelu", + seq_length=7, + vision_feature_select_strategy="default", + vision_feature_layer=-1, + text_config=AriaTextConfig( + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + pad_token_id=1, + hidden_size=32, + intermediate_size=64, + max_position_embeddings=60, + model_type="aria_moe_lm", + moe_intermediate_size=4, + moe_num_experts=4, + moe_topk=2, + num_attention_heads=20, + num_experts_per_tok=3, + num_hidden_layers=2, + num_key_value_heads=20, + rope_theta=5000000, + vocab_size=99, + eos_token_id=2, + head_dim=2, + ), + is_training=True, + vision_config=Idefics3VisionConfig( + image_size=358, + patch_size=10, + num_channels=3, + is_training=True, + hidden_size=32, + projection_dim=20, + num_hidden_layers=2, + num_attention_heads=16, + intermediate_size=10, + dropout=0.1, + attention_dropout=0.1, + initializer_range=0.02, + ), + ): + self.parent = parent + self.ignore_index = ignore_index + self.image_token_index = image_token_index + self.projector_hidden_act = projector_hidden_act + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + self.text_config = text_config + self.vision_config = vision_config + self.pad_token_id = text_config.pad_token_id + self.eos_token_id = text_config.eos_token_id + self.num_hidden_layers = text_config.num_hidden_layers + self.vocab_size = text_config.vocab_size + self.hidden_size = text_config.hidden_size + self.num_attention_heads = text_config.num_attention_heads + 
self.is_training = is_training + + self.batch_size = 10 + self.num_channels = 3 + self.image_size = 358 + self.num_image_tokens = 128 + self.seq_length = seq_length + self.num_image_tokens + + def get_config(self): + return AriaConfig( + text_config=self.text_config, + vision_config=self.vision_config, + ignore_index=self.ignore_index, + image_token_index=self.image_token_index, + projector_hidden_act=self.projector_hidden_act, + vision_feature_select_strategy=self.vision_feature_select_strategy, + vision_feature_layer=self.vision_feature_layer, + eos_token_id=self.eos_token_id, + ) + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor( + [ + self.batch_size, + self.vision_config.num_channels, + self.vision_config.image_size, + self.vision_config.image_size, + ] + ) + config = self.get_config() + + return config, pixel_values + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + config, pixel_values = config_and_inputs + input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1 + attention_mask = input_ids.ne(1).to(torch_device) + input_ids[input_ids == config.image_token_index] = self.pad_token_id + input_ids[:, : self.num_image_tokens] = config.image_token_index + inputs_dict = { + "pixel_values": pixel_values, + "input_ids": input_ids, + "attention_mask": attention_mask, + } + return config, inputs_dict + + def create_and_check_aria_model_fp16_forward(self, config, input_ids, pixel_values, attention_mask): + model = AriaForConditionalGeneration(config=config) + model.to(torch_device) + model.eval() + with torch.autocast(device_type="cuda", dtype=torch.float16): + logits = model( + input_ids=input_ids, + attention_mask=attention_mask, + pixel_values=pixel_values.to(torch.bfloat16), + return_dict=True, + )["logits"] + self.parent.assertFalse(torch.isnan(logits).any().item()) + + +@require_torch +class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase): + """ + Model tester for `AriaForConditionalGeneration`. 
+ """ + + all_model_classes = (AriaForConditionalGeneration,) if is_torch_available() else () + all_generative_model_classes = (AriaForConditionalGeneration,) if is_torch_available() else () + test_pruning = False + test_head_masking = False + _is_composite = True + + def setUp(self): + self.model_tester = AriaVisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=AriaConfig, has_text_modality=False) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + def test_inputs_embeds(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + wte = model.get_input_embeddings() + inputs["inputs_embeds"] = wte(input_ids) + + with torch.no_grad(): + model(**inputs) + + # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs + # while some other models require pixel_values to be present + def test_inputs_embeds_matches_input_ids(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + + inputs = self._prepare_for_class(inputs_dict, model_class) + input_ids = inputs["input_ids"] + del inputs["input_ids"] + del inputs["pixel_values"] + + inputs_embeds = model.get_input_embeddings()(input_ids) + + with torch.no_grad(): + out_ids = model(input_ids=input_ids, **inputs)[0] + out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0] + self.assertTrue(torch.allclose(out_embeds, out_ids)) + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant(self): + pass + + @unittest.skip( + reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" + ) + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_compile_dynamic(self): + pass + + @unittest.skip(reason="Compile not yet supported because in LLava models") + def test_sdpa_can_dispatch_on_flash(self): + pass + + @unittest.skip(reason="") + def test_new_cache_format_0(self): + pass + + @unittest.skip(reason="") + def test_new_cache_format_1(self): + pass + + @unittest.skip(reason="") + def test_new_cache_format_2(self): + pass + + @unittest.skip(reason="Feedforward chunking is not yet supported") + def test_feed_forward_chunking(self): + pass + + @unittest.skip(reason="Unstable test") + def test_initialization(self): + pass + + @unittest.skip(reason="Unstable test") + def test_dola_decoding_sample(self): + pass + + @unittest.skip(reason="Unsupported") + def test_generate_from_inputs_embeds_0_greedy(self): + pass + + @unittest.skip(reason="Unsupported") + def test_generate_from_inputs_embeds_1_beam_search(self): + pass + + 
@unittest.skip(reason="Unsupported") + def test_generate_with_static_cache(self): + pass + + +@require_torch +class AriaForConditionalGenerationIntegrationTest(unittest.TestCase): + def setUp(self): + self.processor = AutoProcessor.from_pretrained("rhymes-ai/Aria") + + def tearDown(self): + gc.collect() + torch.cuda.empty_cache() + + @slow + @require_bitsandbytes + def test_small_model_integration_test(self): + # Let' s make sure we test the preprocessing to replace what is used + model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True) + + prompt = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:" + image_file = "https://aria-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt") + + EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip + self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS)) + + output = model.generate(**inputs, max_new_tokens=20) + EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip + + self.assertEqual( + self.processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_single(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "rhymes-ai/Aria" + + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" + image_file = "https://aria-vl.github.io/static/images/view.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + output = model.generate(**inputs, max_new_tokens=900, do_sample=False) + EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." 
# fmt: skip + + self.assertEqual( + processor.decode(output[0], skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "rhymes-ai/Aria" + + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", + "USER: \nWhat is this? ASSISTANT:", + ] + image1 = Image.open(requests.get("https://aria-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_batch(self): + # Let' s make sure we test the preprocessing to replace what is used + model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True) + # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!. + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://aria-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = [ + 'USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, there are a few things to be cautious about and items to bring.', + 'USER: \nWhat is this?\nASSISTANT: Cats' + ] # fmt: skip + self.assertEqual( + self.processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_bitsandbytes + def test_small_model_integration_test_llama_batched_regression(self): + # Let' s make sure we test the preprocessing to replace what is used + model_id = "rhymes-ai/Aria" + + # Multi-image & multi-prompt (e.g. 3 images and 2 prompts now fails with SDPA, this tests if "eager" works as before) + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True, attn_implementation="eager") + processor = AutoProcessor.from_pretrained(model_id, pad_token="") + + prompts = [ + "USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT:", + "USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT:", + ] + image1 = Image.open(requests.get("https://aria-vl.github.io/static/images/view.jpg", stream=True).raw) + image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + + inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True) + + output = model.generate(**inputs, max_new_tokens=20) + + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: \nAnd this?\nASSISTANT: A cat sleeping on a bed.'] # fmt: skip + + self.assertEqual( + processor.batch_decode(output, skip_special_tokens=True), + EXPECTED_DECODED_TEXT, + ) + + @slow + @require_torch + @require_vision + def test_batched_generation(self): + model = AriaForConditionalGeneration.from_pretrained("rhymes-ai/Aria", load_in_4bit=True) + + processor = AutoProcessor.from_pretrained("rhymes-ai/Aria") + + prompt1 = "\n\nUSER: What's the the difference of two images?\nASSISTANT:" + prompt2 = "\nUSER: Describe the image.\nASSISTANT:" + prompt3 = "\nUSER: Describe the image.\nASSISTANT:" + url1 = "https://images.unsplash.com/photo-1552053831-71594a27632d?q=80&w=3062&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + url2 = "https://images.unsplash.com/photo-1617258683320-61900b281ced?q=80&w=3087&auto=format&fit=crop&ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D" + image1 = Image.open(requests.get(url1, stream=True).raw) + image2 = Image.open(requests.get(url2, stream=True).raw) + + inputs = processor( + images=[image1, image2, image1, image2], + text=[prompt1, prompt2, prompt3], + return_tensors="pt", + padding=True, + ).to(torch_device) + + model = model.eval() + + EXPECTED_OUTPUT = [ + "\n \nUSER: What's the the difference of two images?\nASSISTANT: The difference between the two images is that one shows a dog standing on a grassy field, while", + "\nUSER: Describe the image.\nASSISTANT: The image features a brown and white dog sitting on a sidewalk. The dog is holding a small", + "\nUSER: Describe the image.\nASSISTANT: The image features a lone llama standing on a grassy hill. 
The llama is the", + ] + + generate_ids = model.generate(**inputs, max_new_tokens=20) + outputs = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False) + self.assertEqual(outputs, EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_aria_index_error_bug(self): + # This is a reproducer of https://github.com/huggingface/transformers/pull/28032 and makes sure it does not happen anymore + # Please refer to that PR, or specifically https://github.com/huggingface/transformers/pull/28032#issuecomment-1860650043 for + # more details + model_id = "rhymes-ai/Aria" + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + + processor = AutoProcessor.from_pretrained(model_id) + + # Simulate a super long prompt + user_prompt = "Describe the image:?\n" * 200 + prompt = f"USER: \n{user_prompt}ASSISTANT:" + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + + # Make sure that `generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_torch_gpu + def test_aria_merge_inputs_error_bug(self): + # This is a reproducer of https://github.com/huggingface/transformers/pull/28333 and makes sure it does not happen anymore + model_id = "rhymes-ai/Aria" + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + + # Simulate some user inputs + pixel_values = torch.randn( + (1, 3, 336, 336), + dtype=torch.float, + device=torch_device, + ) + input_ids = torch.tensor( + [ + [32001, 32001, 1, 15043, 7084, 32000, 29871, 13, 7900], + ], + dtype=torch.long, + device=torch_device, + ) + attention_mask = torch.tensor( + [[0, 0, 1, 1, 1, 1, 1, 1, 1]], + dtype=torch.long, + device=torch_device, + ) + + # Make sure that the loss is properly computed + loss = model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + labels=input_ids, + ).loss + loss.backward() + + def test_tokenizer_integration(self): + model_id = "rhymes-ai/Aria" + slow_tokenizer = AutoTokenizer.from_pretrained( + model_id, bos_token="<|startoftext|>", eos_token="<|endoftext|>", use_fast=False + ) + slow_tokenizer.add_tokens("", True) + + fast_tokenizer = AutoTokenizer.from_pretrained( + model_id, + bos_token="<|startoftext|>", + eos_token="<|endoftext|>", + from_slow=True, + legacy=False, + ) + fast_tokenizer.add_tokens("", True) + + prompt = "<|startoftext|><|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|>" + EXPECTED_OUTPUT = ['<|startoftext|>', '<', '|', 'im', '_', 'start', '|', '>', 'system', '\n', 'Answer', '▁the', '▁questions', '.<', '|', 'im', '_', 'end', '|', '><', '|', 'im', '_', 'start', '|', '>', 'user', '\n', '', '\n', 'What', '▁is', '▁shown', '▁in', '▁this', '▁image', '?', '<', '|', 'im', '_', 'end', '|', '>'] # fmt: skip + self.assertEqual(slow_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + self.assertEqual(fast_tokenizer.tokenize(prompt), EXPECTED_OUTPUT) + + @slow + @require_bitsandbytes + def test_generation_no_images(self): + model_id = "rhymes-ai/Aria" + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + # Prepare inputs with no images + inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device) + + # Make sure that 
`generate` works + _ = model.generate(**inputs, max_new_tokens=20) + + @slow + @require_bitsandbytes + def test_generation_siglip_backbone(self): + model_id = "rhymes-ai/Aria" + model = AriaForConditionalGeneration.from_pretrained(model_id, torch_dtype="float16", device_map=torch_device) + processor = AutoProcessor.from_pretrained(model_id) + + # check processing with expansion of inputs (w/o expansion should work with any backbone) + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + inputs = processor( + text="<|im_start|>user\n\nWhat are these?<|im_end|>\n<|im_start|>assistant", + images=raw_image, + return_tensors="pt", + ).to(torch_device, torch.float16) + + # Make sure that `generate` works + output = model.generate(**inputs, max_new_tokens=30) + + EXPECTED_DECODED_TEXT = "user\n\nWhat are these?\nassistant The image shows two cats, one on the left and one on the right. They appear to be resting or sleeping on a pink blanket. The cat" + self.assertTrue(processor.batch_decode(output, skip_special_tokens=True)[0] == EXPECTED_DECODED_TEXT) + + @slow + @require_bitsandbytes + def test_expansion_in_processing(self): + model_id = "rhymes-ai/Aria" + model = AriaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True) + processor = AutoProcessor.from_pretrained(model_id) + + prompt = "USER: \nDescribe the image:\nASSISTANT:" + image_file = "http://images.cocodataset.org/val2017/000000039769.jpg" + raw_image = Image.open(requests.get(image_file, stream=True).raw) + + # check processing with expansion of inputs + processor.vision_feature_select_strategy = "default" + processor.patch_size = 14 + inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593) + + # check processing without expansion of inputs (legacy behavior) + processor.vision_feature_select_strategy = None + processor.patch_size = None + inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16) + self.assertTrue(inputs.input_ids.shape[-1] == 18) + + # generate exactly 20 tokens + output = model.generate(**inputs, min_new_tokens=20, max_new_tokens=20) + output_expanded = model.generate(**inputs_expanded, min_new_tokens=20, max_new_tokens=20) + + # check that both inputs are handled correctly and generate the same output + self.assertListEqual(output_expanded[:, -20:].tolist(), output[:, -20:].tolist()) + + @slow + @require_bitsandbytes + def test_pixtral(self): + model_id = "rhymes-ai/Aria" + model = AriaForConditionalGeneration.from_pretrained(model_id) + processor = AutoProcessor.from_pretrained(model_id) + + IMG_URLS = [ + Image.open(requests.get("https://picsum.photos/id/237/400/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/231/200/300", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/27/500/500", stream=True).raw), + Image.open(requests.get("https://picsum.photos/id/17/150/600", stream=True).raw), + ] + PROMPT = "[INST]Describe the images.\n[IMG][IMG][IMG][IMG][/INST]" + + # image = Image.open(requests.get(url, stream=True).raw) + inputs = processor(text=PROMPT, images=IMG_URLS, return_tensors="pt").to("cuda") + generate_ids = model.generate(**inputs, max_new_tokens=500) + ouptut = processor.batch_decode(generate_ids, 
skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + # fmt: off + EXPECTED_GENERATION = """ +Describe the images. +Sure, let's break down each image description: + +1. **Image 1:** + - **Description:** A black dog with a glossy coat is sitting on a wooden floor. The dog has a focused expression and is looking directly at the camera. + - **Details:** The wooden floor has a rustic appearance with visible wood grain patterns. The dog's eyes are a striking color, possibly brown or amber, which contrasts with its black fur. + +2. **Image 2:** + - **Description:** A scenic view of a mountainous landscape with a winding road cutting through it. The road is surrounded by lush green vegetation and leads to a distant valley. + - **Details:** The mountains are rugged with steep slopes, and the sky is clear, indicating good weather. The winding road adds a sense of depth and perspective to the image. + +3. **Image 3:** + - **Description:** A beach scene with waves crashing against the shore. There are several people in the water and on the beach, enjoying the waves and the sunset. + - **Details:** The waves are powerful, creating a dynamic and lively atmosphere. The sky is painted with hues of orange and pink from the setting sun, adding a warm glow to the scene. + +4. **Image 4:** + - **Description:** A garden path leading to a large tree with a bench underneath it. The path is bordered by well-maintained grass and flowers. + - **Details:** The path is made of small stones or gravel, and the tree provides a shaded area with the bench invitingly placed beneath it. The surrounding area is lush and green, suggesting a well-kept garden. + +Each image captures a different scene, from a close-up of a dog to expansive natural landscapes, showcasing various elements of nature and human interaction with it. +""" + # fmt: on + # check that the decoded output matches the expected generation + self.assertEqual(output, EXPECTED_GENERATION) diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py new file mode 100644 index 00000000000000..7e23d861c775c0 --- /dev/null +++ b/tests/models/aria/test_processor_aria.py @@ -0,0 +1,391 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import shutil +import tempfile +import unittest +from io import BytesIO +from typing import Optional + +import numpy as np +import requests + +from transformers import AriaProcessor +from transformers.models.auto.processing_auto import AutoProcessor +from transformers.testing_utils import require_torch, require_vision +from transformers.utils import is_vision_available + +from ...test_processing_common import ProcessorTesterMixin + + +if is_vision_available(): + from PIL import Image + + +@require_torch +@require_vision +class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase): + processor_class = AriaProcessor + + @classmethod + def setUpClass(cls): + cls.tmpdirname = tempfile.mkdtemp() + processor = AriaProcessor.from_pretrained("m-ric/Aria_hf_2", image_seq_len=2) + processor.save_pretrained(cls.tmpdirname) + cls.image1 = Image.open( + BytesIO( + requests.get( + "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + ).content + ) + ) + cls.image2 = Image.open( + BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content) + ) + cls.image3 = Image.open( + BytesIO( + requests.get( + "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg" + ).content + ) + ) + cls.bos_token = "<|im_start|>" + cls.eos_token = "<|im_end|>" + + cls.image_token = processor.tokenizer.image_token + cls.fake_image_token = "o" + cls.global_img_token = "<|img|>" + + cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token) + cls.eos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.eos_token) + + cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token) + cls.fake_image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.fake_image_token) + cls.global_img_tokens_id = processor.tokenizer(cls.global_img_token, add_special_tokens=False)["input_ids"] + cls.padding_token_id = processor.tokenizer.pad_token_id + cls.image_seq_len = 256 + + def get_tokenizer(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer + + def get_image_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor + + def get_processor(self, **kwargs): + return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs) + + @classmethod + def tearDownClass(cls): + shutil.rmtree(cls.tmpdirname) + + def test_kwargs_overrides_default_image_processor_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + processor_components = self.prepare_components() + processor_components["image_processor"] = self.get_component( + "image_processor", do_rescale=True, rescale_factor=1 + ) + processor_components["tokenizer"] = self.get_component("tokenizer", max_length=117, padding="max_length") + + processor = self.processor_class(**processor_components) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertLessEqual(inputs[self.images_input_name][0][0].mean(), 0) + + def test_process_interleaved_images_prompts_image_splitting(self): + processor = self.get_processor() + processor.image_processor.split_image = True + + # Test that a single image is processed correctly + inputs = 
processor(images=self.image1, text="Ok<|img|>", images_kwargs={"split_image": True}) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (2, 3, 980, 980)) + self.assertEqual(np.array(inputs["pixel_mask"]).shape, (2, 980, 980)) + + def test_process_interleaved_images_prompts_no_image_splitting(self): + processor = self.get_processor() + processor.image_processor.split_image = False + + # Test that a single image is processed correctly + inputs = processor(images=self.image1, text="Ok<|img|>") + image1_expected_size = (980, 980) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 3, *image1_expected_size)) + self.assertEqual(np.array(inputs["pixel_mask"]).shape, (1, *image1_expected_size)) + # fmt: on + + # Test a single sample with image and text + image_str = "<|img|>" + text_str = "In this image, we see" + text = image_str + text_str + inputs = processor(text=text, images=self.image1) + + # fmt: off + tokenized_sentence = processor.tokenizer(text_str, add_special_tokens=False) + + expected_input_ids = [[self.image_token_id] * self.image_seq_len + tokenized_sentence["input_ids"]] + # self.assertEqual(len(inputs["input_ids"]), len(expected_input_ids)) + + self.assertEqual(inputs["input_ids"], expected_input_ids) + self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])]) + self.assertEqual(np.array(inputs["pixel_values"]).shape, (1, 3, *image1_expected_size)) + self.assertEqual(np.array(inputs["pixel_mask"]).shape, (1, *image1_expected_size)) + # fmt: on + + # Test that batch is correctly processed + image_str = "<|img|>" + text_str_1 = "In this image, we see" + text_str_2 = "In this image, we see" + + text = [ + image_str + text_str_1, + image_str + image_str + text_str_2, + ] + images = [[self.image1], [self.image2, self.image3]] + + inputs = processor(text=text, images=images, padding=True) + + # fmt: off + tokenized_sentence_1 = processor.tokenizer(text_str_1, add_special_tokens=False) + tokenized_sentence_2 = processor.tokenizer(text_str_2, add_special_tokens=False) + + image_tokens = [self.image_token_id] * self.image_seq_len + expected_input_ids_1 = image_tokens + tokenized_sentence_1["input_ids"] + expected_input_ids_2 = 2 * image_tokens + tokenized_sentence_2["input_ids"] + + # Pad the first input to match the second input + pad_len = len(expected_input_ids_2) - len(expected_input_ids_1) + + expected_attention_mask = [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * (len(expected_input_ids_2))] + + self.assertEqual( + inputs["attention_mask"], + expected_attention_mask + ) + self.assertEqual(np.array(inputs['pixel_values']).shape, (3, 3, 980, 980)) + self.assertEqual(np.array(inputs['pixel_mask']).shape, (3, 980, 980)) + # fmt: on + + def test_non_nested_images_with_batched_text(self): + processor = self.get_processor() + processor.image_processor.do_image_splitting = False + + image_str = "<|img|>" + text_str_1 = "In this image, we see" + text_str_2 = "In this image, we see" + + text = [ + image_str + text_str_1, + image_str + image_str + text_str_2, + ] + images = [self.image1, self.image2, self.image3] + + inputs = processor(text=text, images=images, padding=True) + + self.assertEqual(np.array(inputs["pixel_values"]).shape, (3, 3, 980, 980)) + self.assertEqual(np.array(inputs["pixel_mask"]).shape, (3, 980, 980)) + + def test_apply_chat_template(self): + # Message contains content which a mix of lists with images and image urls and string + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What do these 
images show?"}, + {"type": "image"}, + {"type": "image"}, + "What do these images show?", + ], + }, + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.", + } + ], + }, + {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]}, + ] + processor = self.get_processor() + # Make short sequence length to test that the fake tokens are added correctly + rendered = processor.apply_chat_template(messages, add_generation_prompt=True) + print(rendered) + + expected_rendered = """<|im_start|>user +What do these images show?<|img|><|img|><|im_end|> +<|im_start|>assistant +The first image shows the statue of Liberty in New York. The second image picture depicts Idefix, the dog of Obelix in Asterix and Obelix.<|im_end|> +<|im_start|>user +And who is that?<|im_end|> +<|im_start|>assistant +""" + self.assertEqual(rendered, expected_rendered) + + # Override as AriaProcessor needs image tokens in prompts + def prepare_text_inputs(self, batch_size: Optional[int] = None): + if batch_size is None: + return "lower newer <|img|>" + + if batch_size < 1: + raise ValueError("batch_size must be greater than 0") + + if batch_size == 1: + return ["lower newer <|img|>"] + return ["lower newer <|img|>", "<|img|> upper older longer string"] + ["<|img|> lower newer"] * ( + batch_size - 2 + ) + + # Override tests as inputs_ids padded dimension is the second one but not the last one + @require_vision + @require_torch + def test_kwargs_overrides_default_tokenizer_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=30) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt", max_length=30) + self.assertEqual(len(inputs["input_ids"][0]), 30) + + @require_torch + @require_vision + def test_structured_kwargs_nested(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + inputs = processor( + text=input_str, + images=image_input, + common_kwargs={"return_tensors": "pt"}, + images_kwargs={"max_image_size": 980}, + text_kwargs={"padding": "max_length", "max_length": 120, "truncation": "longest_first"}, + ) + self.skip_processor_without_typed_kwargs(processor) + + self.assertEqual(inputs["pixel_values"].shape[3], 980) + + self.assertEqual(len(inputs["input_ids"][0]), 120) + + @require_torch + @require_vision + def test_structured_kwargs_nested_from_dict(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + + 
image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + # Define the kwargs for each modality + all_kwargs = { + "common_kwargs": {"return_tensors": "pt"}, + "images_kwargs": {"max_image_size": 980}, + "text_kwargs": {"padding": "max_length", "max_length": 120, "truncation": "longest_first"}, + } + + inputs = processor(text=input_str, images=image_input, **all_kwargs) + self.assertEqual(inputs["pixel_values"].shape[3], 980) + self.assertEqual(len(inputs["input_ids"][0]), 120) + + @require_vision + @require_torch + def test_tokenizer_defaults_preserved_by_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer", max_length=30) + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input, return_tensors="pt") + self.assertEqual(len(inputs["input_ids"][0]), 30) + + @require_torch + @require_vision + def test_unstructured_kwargs_batched(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs(batch_size=2) + image_input = self.prepare_image_inputs(batch_size=2) + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + padding="longest", + max_length=76, + truncation=True, + max_image_size=980, + ) + + self.assertEqual(inputs["pixel_values"].shape[1], 3) + self.assertEqual(inputs["pixel_values"].shape[3], 980) + self.assertEqual(len(inputs["input_ids"][0]), 76) + + @require_torch + @require_vision + def test_unstructured_kwargs(self): + if "image_processor" not in self.processor_class.attributes: + self.skipTest(f"image_processor attribute not present in {self.processor_class}") + image_processor = self.get_component("image_processor") + tokenizer = self.get_component("tokenizer") + + processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor) + self.skip_processor_without_typed_kwargs(processor) + + input_str = self.prepare_text_inputs() + image_input = self.prepare_image_inputs() + inputs = processor( + text=input_str, + images=image_input, + return_tensors="pt", + max_image_size=980, + padding="max_length", + max_length=120, + truncation="longest_first", + ) + + self.assertEqual(inputs["pixel_values"].shape[3], 980) + self.assertEqual(len(inputs["input_ids"][0]), 120) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 9b8244c243fc4a..1c81c08fd845b1 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -381,7 +381,7 @@ def check_config_attributes_being_used(config_class): def 
check_config_attributes(): - """Check the arguments in `__init__` of all configuration classes are used in python files""" + """Check the arguments in `__init__` of all configuration classes are used in python files""" configs_with_unused_attributes = {} for _config_class in list(CONFIG_MAPPING.values()): # Skip deprecated models diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py index a2ea05edce8063..a63ca59690f748 100644 --- a/utils/check_docstrings.py +++ b/utils/check_docstrings.py @@ -865,9 +865,10 @@ def match_docstring_with_signature(obj: Any) -> Optional[Tuple[str, str]]: # We went too far by one (perhaps more if there are a lot of new lines) idx -= 1 - while len(obj_doc_lines[idx].strip()) == 0: - arguments[current_arg] = arguments[current_arg][:-1] - idx -= 1 + if current_arg: + while len(obj_doc_lines[idx].strip()) == 0: + arguments[current_arg] = arguments[current_arg][:-1] + idx -= 1 # And we went too far by one again. idx += 1 diff --git a/utils/check_repo.py b/utils/check_repo.py index 10be5cdcd26230..3dbe59f192293a 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -85,6 +85,8 @@ "Idefics2PerceiverResampler", "Idefics2VisionTransformer", "Idefics3VisionTransformer", + "AriaTextForCausalLM", + "AriaTextModel", ] # Update this list for models that are not tested with a comment explaining the reason it should not be. diff --git a/utils/modular_model_converter.py b/utils/modular_model_converter.py index cf1a0cfd95ca0a..e8d117cd2af08f 100644 --- a/utils/modular_model_converter.py +++ b/utils/modular_model_converter.py @@ -1678,7 +1678,7 @@ def save_modeling_file(modular_file, converted_file): parser = argparse.ArgumentParser() parser.add_argument( "--files_to_parse", - default=["src/transformers/models/gemma/modular_gemma.py"], + default=["src/transformers/models/aria/modular_aria.py"], nargs="+", help="A list of `modular_xxxx` files that should be converted to single model file", ) From 7f95372c6267d3163fd2aa74aeff9d84ddb6cc35 Mon Sep 17 00:00:00 2001 From: Victor Agostinelli <35327788+agostinv@users.noreply.github.com> Date: Fri, 6 Dec 2024 04:39:45 -0800 Subject: [PATCH 06/62] Add feature dim attributes to BitLinear for easier PEFT integration (#34946) Update bitnet.py, extremely small change to allow for easier PEFT integration Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> --- src/transformers/integrations/bitnet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/integrations/bitnet.py b/src/transformers/integrations/bitnet.py index 3386bdcb43b27c..0b50f9738afb69 100644 --- a/src/transformers/integrations/bitnet.py +++ b/src/transformers/integrations/bitnet.py @@ -127,6 +127,8 @@ class BitLinear(nn.Module): def __init__(self, in_features: int, out_features: int, bias: bool, device=None, dtype=None): super().__init__() self.dtype = dtype + self.in_features = in_features + self.out_features = out_features self.register_buffer( "weight", torch.zeros( From c8c8dffbe45ebef0a8dba4a51024e5e5e498596b Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Fri, 6 Dec 2024 13:42:51 +0000 Subject: [PATCH 07/62] Update I-JEPA checkpoints path (#35120) Update checkpoints path --- docs/source/en/model_doc/ijepa.md | 2 +- src/transformers/models/ijepa/modular_ijepa.py | 2 +- tests/models/ijepa/test_modeling_ijepa.py | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index 9a0cd368a8188f..32944e2617eae1 100644 --- 
a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -45,7 +45,7 @@ url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg" image_1 = Image.open(requests.get(url_1, stream=True).raw) image_2 = Image.open(requests.get(url_2, stream=True).raw) -model_id = "jmtzt/ijepa_vith14_1k" +model_id = "facebook/ijepa_vith14_1k" processor = AutoProcessor.from_pretrained(model_id) model = AutoModel.from_pretrained(model_id) diff --git a/src/transformers/models/ijepa/modular_ijepa.py b/src/transformers/models/ijepa/modular_ijepa.py index efbd71d91342fd..3b3756dd5ce697 100644 --- a/src/transformers/models/ijepa/modular_ijepa.py +++ b/src/transformers/models/ijepa/modular_ijepa.py @@ -155,7 +155,7 @@ def __init__(self, config: IJepaConfig, add_pooling_layer: bool = False, use_mas self.embeddings = IJepaEmbeddings(config, use_mask_token=use_mask_token) -_IMAGE_CLASS_CHECKPOINT = "jmtzt/ijepa_vith14_1k" +_IMAGE_CLASS_CHECKPOINT = "facebook/ijepa_vith14_1k" _IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat" diff --git a/tests/models/ijepa/test_modeling_ijepa.py b/tests/models/ijepa/test_modeling_ijepa.py index 27a79bc6724285..723ddcf7988826 100644 --- a/tests/models/ijepa/test_modeling_ijepa.py +++ b/tests/models/ijepa/test_modeling_ijepa.py @@ -250,7 +250,7 @@ def test_for_image_classification(self): @slow def test_model_from_pretrained(self): - model_name = "jmtzt/ijepa_vith14_1k" + model_name = "facebook/ijepa_vith14_1k" model = IJepaModel.from_pretrained(model_name) self.assertIsNotNone(model) @@ -266,11 +266,11 @@ def prepare_img(): class IJepaModelIntegrationTest(unittest.TestCase): @cached_property def default_image_processor(self): - return ViTImageProcessor.from_pretrained("jmtzt/ijepa_vith14_1k") if is_vision_available() else None + return ViTImageProcessor.from_pretrained("facebook/ijepa_vith14_1k") if is_vision_available() else None @slow def test_inference_no_head(self): - model = IJepaModel.from_pretrained("jmtzt/ijepa_vith14_1k").to(torch_device) + model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k").to(torch_device) image_processor = self.default_image_processor image = prepare_img() @@ -299,7 +299,7 @@ def test_inference_fp16(self): A small test to make sure that inference work in half precision without any problem. """ model = IJepaModel.from_pretrained( - "jmtzt/ijepa_vith14_1k", + "facebook/ijepa_vith14_1k", torch_dtype=torch.float16, device_map="auto", ) @@ -319,7 +319,7 @@ def test_inference_interpolate_pos_encoding(self): # allowing to interpolate the pre-trained position embeddings in order to use # the model on higher resolutions. The DINO model by Facebook AI leverages this # to visualize self-attention on higher resolution images. 
- model = IJepaModel.from_pretrained("jmtzt/ijepa_vith14_1k").to(torch_device) + model = IJepaModel.from_pretrained("facebook/ijepa_vith14_1k").to(torch_device) image_processor = self.default_image_processor image = prepare_img() From 1ccca8f48c493a4804921da66f65023e3f9c8d9c Mon Sep 17 00:00:00 2001 From: kang sheng Date: Mon, 9 Dec 2024 16:57:41 +0800 Subject: [PATCH 08/62] Fix GA loss bugs and add unit test (#35121) * fix GA bugs and add unit test * narrow down model loss unit test diff gap * format code to make ruff happy * send num_items_in_batch argument to decoder * fix GA loss bug in BertLMHeadModel * use TinyStories-33M to narrow down diff gap * fotmat code * missing .config * avoid add extra args --------- Co-authored-by: kangsheng --- src/transformers/models/bert/modeling_bert.py | 7 +- .../modeling_speech_encoder_decoder.py | 2 + src/transformers/trainer.py | 9 +- tests/trainer/test_trainer.py | 114 ++++++++++++++++-- 4 files changed, 108 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 6b05fa648158a6..e311f93b6c81ed 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1325,6 +1325,7 @@ def forward( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, + **loss_kwargs, ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: r""" encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): @@ -1375,11 +1376,7 @@ def forward( lm_loss = None if labels is not None: - # we are doing next-token prediction; shift prediction scores and input ids by one - shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() - labels = labels[:, 1:].contiguous() - loss_fct = CrossEntropyLoss() - lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs) if not return_dict: output = (prediction_scores,) + outputs[2:] diff --git a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py index 0d2b911bebe582..3bff8f6acd290d 100644 --- a/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py @@ -491,6 +491,8 @@ def forward( kwargs_decoder = { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } + if "num_items_in_batch" in kwargs_encoder: + kwargs_decoder["num_items_in_batch"] = kwargs_encoder.pop("num_items_in_batch", None) if encoder_outputs is None: if inputs is None: diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index af908e48e4b8c4..f7d79481809807 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3649,10 +3649,7 @@ def training_step( return loss_mb.reduce_mean().detach().to(self.args.device) with self.compute_loss_context_manager(): - if self.model_accepts_loss_kwargs: - loss = self.compute_loss(model, inputs) - else: - loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) del inputs if ( @@ -5132,10 +5129,6 @@ def get_batch_samples(self, 
epoch_iterator, num_batches): except StopIteration: break - # Keep default behavior the same - if not self.model_accepts_loss_kwargs: - return batch_samples, None - if len(batch_samples) > 0 and "labels" in batch_samples[0]: # For now we don't support object detection try: diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py index f7b4a8637bff85..d33be2789761da 100644 --- a/tests/trainer/test_trainer.py +++ b/tests/trainer/test_trainer.py @@ -750,11 +750,102 @@ def test_model_init(self): self.check_trained_model(trainer.model, alternate_seed=True) @slow - def test_gradient_accumulation_loss_alignment(self): + def test_gradient_accumulation_loss_alignment_with_model_loss(self): set_seed(42) import datasets - model_name = "distilgpt2" + model_name = "nickypro/tinyllama-110M" + dataset_name = "wikitext" + dataset_config = "wikitext-2-raw-v1" + dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:500]") + dataset = dataset.train_test_split(test_size=0.2) + tokenizer = AutoTokenizer.from_pretrained(model_name) + + tokenizer.pad_token = tokenizer.eos_token + + def tokenize_function(examples): + return tokenizer(examples["text"], max_length=128, padding="max_length", truncation=True) + + tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset["train"].column_names) + + data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + + model = AutoModelForCausalLM.from_pretrained(model_name) + + base_loss_callback = StoreLossCallback() + + args_kwargs = { + "report_to": "none", + "logging_steps": 1, + "max_steps": 20, + "learning_rate": 3e-4, + "disable_tqdm": True, + } + + args = TrainingArguments( + "./generation", + **args_kwargs, + ) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[base_loss_callback], + data_collator=data_collator, + ) + assert trainer.model_accepts_loss_kwargs + trainer.train() + + grad_accum_loss_callback = StoreLossCallback() + args = TrainingArguments( + "./generation", + **args_kwargs, + gradient_accumulation_steps=2, + per_device_train_batch_size=4, + ) + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[grad_accum_loss_callback], + data_collator=data_collator, + ) + trainer.train() + + set_seed(42) + model = AutoModelForCausalLM.from_pretrained(model_name) + broken_loss_callback = StoreLossCallback() + trainer = Trainer( + model, + args, + train_dataset=tokenized_dataset["train"], + callbacks=[broken_loss_callback], + data_collator=data_collator, + ) + # disable model_accepts_loss_kwargs + trainer.model_accepts_loss_kwargs = False + trainer.train() + + # Calculate the difference between the base loss and the grad_accum loss + diff_truth = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) + ] + diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + + # all diff truth should be quite close + self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") + + # max diff broken should be very off + self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") + + @slow + def test_gradient_accumulation_loss_alignment_with_loss_func(self): + set_seed(42) + import datasets + + model_name = "roneneldan/TinyStories-33M" dataset_name = "wikitext" 
dataset_config = "wikitext-2-raw-v1" dataset = datasets.load_dataset(dataset_name, dataset_config, split="train[:500]") @@ -836,15 +927,16 @@ def compute_loss(logits, labels, vocab_size, num_items_in_batch, disable_num_ite trainer.train() # Calculate the difference between the base loss and the grad_accum loss - diff_truth = [base - grad for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses)] - diff_broken = [base - grad for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] - # These should be quite close - for diff in diff_truth: - self.assertLess(abs(diff), 0.1, f"Difference {diff} is not within 0.1") - - # These should be very off - for diff in diff_broken: - self.assertGreater(abs(diff), 0.1, f"Difference {diff} is not greater than 0.1") + diff_truth = [ + abs(base - grad) for base, grad in zip(base_loss_callback.losses, grad_accum_loss_callback.losses) + ] + diff_broken = [abs(base - grad) for base, grad in zip(base_loss_callback.losses, broken_loss_callback.losses)] + + # all diff truth should be quite close + self.assertLess(max(diff_truth), 0.01, f"Difference {max(diff_truth)} is not within 0.01") + + # max diff broken should be very off + self.assertGreater(max(diff_broken), 3, f"Difference {max(diff_broken)} is not greater than 3") def test_gradient_accumulation(self): # Training with half the batch size but accumulation steps as 2 should give the same training losses. From 9e420e02698f73a70ec1c99961f166c1b5df98bd Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Mon, 9 Dec 2024 10:01:31 +0100 Subject: [PATCH 09/62] [I-JEPA] Update docs (#35148) Update docs --- docs/source/en/model_doc/ijepa.md | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/docs/source/en/model_doc/ijepa.md b/docs/source/en/model_doc/ijepa.md index 32944e2617eae1..cb2afd25e20bca 100644 --- a/docs/source/en/model_doc/ijepa.md +++ b/docs/source/en/model_doc/ijepa.md @@ -18,13 +18,18 @@ rendered properly in your Markdown viewer. ## Overview -The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/pdf/2301.08243.pdf) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. +The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://arxiv.org/abs/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas. I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations. The abstract from the paper is the following: This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image- based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. 
A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample target blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transformers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction. + + + I-JEPA architecture. Taken from the original paper. + This model was contributed by [jmtzt](https://huggingface.co/jmtzt). The original code can be found [here](https://github.com/facebookresearch/ijepa). @@ -63,6 +68,15 @@ similarity = cosine_similarity(embed_1, embed_2) print(similarity) ``` +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA. + + + +- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). +- See also: [Image classification task guide](../tasks/image_classification) + ## IJepaConfig [[autodoc]] IJepaConfig @@ -75,4 +89,4 @@ print(similarity) ## IJepaForImageClassification [[autodoc]] IJepaForImageClassification - - forward + - forward \ No newline at end of file From 1452dc2514f14879d2662db06e2cd1227919bae9 Mon Sep 17 00:00:00 2001 From: UV Date: Mon, 9 Dec 2024 15:12:23 +0530 Subject: [PATCH 10/62] Corrected typo in agent system prompts (#35143) --- src/transformers/agents/prompts.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 7a84b1db44faba..898a7e011a2b05 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -129,7 +129,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): ``` --- -Above example were using tools that might not exist for you. You only have acces to those Tools: +Above example were using tools that might not exist for you. You only have access to these Tools: <> Remember to make sure that variables you use are all defined. @@ -256,7 +256,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): } -Above example were using notional tools that might not exist for you. You only have acces to those tools: +Above example were using notional tools that might not exist for you. You only have access to these tools: <> Here are the rules you should always follow to solve your task: @@ -348,7 +348,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): final_answer(pope_current_age) ``` -Above example were using notional tools that might not exist for you.
On top of performing computations in the Python code snippets that you create, you have access to these tools (and no other tool): <> From de8a0b7547451f8f7ef2c0ac1f338ba77c614cec Mon Sep 17 00:00:00 2001 From: Daniel Bogdoll Date: Mon, 9 Dec 2024 05:29:04 -0500 Subject: [PATCH 11/62] Option to set 'non_blocking' for to(device) in BatchEncoding and BatchFeature (#34883) * Option to set 'non_blocking' for to(device) operation for performance improvements. Defaults to 'false', thus no behavioral changes. * Enabling non_blocking in to() operation of BatchFeature. * Improved docstring on utilization of non_blocking * Force non_blocking as keyword argument Co-authored-by: Pavel Iakubovskii --------- Co-authored-by: Daniel Bogdoll Co-authored-by: Pavel Iakubovskii --- src/transformers/feature_extraction_utils.py | 4 +++- src/transformers/tokenization_utils_base.py | 10 +++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/src/transformers/feature_extraction_utils.py b/src/transformers/feature_extraction_utils.py index f3cde8180c1bd4..6e8007edbc0b78 100644 --- a/src/transformers/feature_extraction_utils.py +++ b/src/transformers/feature_extraction_utils.py @@ -213,6 +213,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": Will be passed to the `to(...)` function of the tensors. kwargs (`Dict`, *optional*): Will be passed to the `to(...)` function of the tensors. + To enable asynchronous data transfer, set the `non_blocking` flag in `kwargs` (defaults to `False`). Returns: [`BatchFeature`]: The same instance after modification. @@ -222,6 +223,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": new_data = {} device = kwargs.get("device") + non_blocking = kwargs.get("non_blocking", False) # Check if the args are a device or a dtype if device is None and len(args) > 0: # device should be always the first argument @@ -241,7 +243,7 @@ def to(self, *args, **kwargs) -> "BatchFeature": # cast and send to device new_data[k] = v.to(*args, **kwargs) elif isinstance(v, torch.Tensor) and device is not None: - new_data[k] = v.to(device=device) + new_data[k] = v.to(device=device, non_blocking=non_blocking) else: new_data[k] = v self.data = new_data diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 0bfcc4aa303665..f4e5b9b3aaf314 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -799,12 +799,13 @@ def as_tensor(value, dtype=None): return self - def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": + def to(self, device: Union[str, "torch.device"], *, non_blocking: bool = False) -> "BatchEncoding": """ - Send all values to device by calling `v.to(device)` (PyTorch only). + Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only). Args: device (`str` or `torch.device`): The device to put the tensors on. + non_blocking (`bool`): Whether to perform the copy asynchronously. Returns: [`BatchEncoding`]: The same instance after modification. 
@@ -816,7 +817,10 @@ def to(self, device: Union[str, "torch.device"]) -> "BatchEncoding": # Otherwise it passes the casts down and casts the LongTensor containing the token idxs # into a HalfTensor if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): - self.data = {k: v.to(device=device) if isinstance(v, torch.Tensor) else v for k, v in self.data.items()} + self.data = { + k: v.to(device=device, non_blocking=non_blocking) if isinstance(v, torch.Tensor) else v + for k, v in self.data.items() + } else: logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") return self From 7238387f67ab89c41502414e552c703a362d3bf5 Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Mon, 9 Dec 2024 14:13:36 +0100 Subject: [PATCH 12/62] Fix typo in EETQ Tests (#35160) fix --- tests/quantization/eetq_integration/test_eetq.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/quantization/eetq_integration/test_eetq.py b/tests/quantization/eetq_integration/test_eetq.py index 2c01f8145cba0e..f14fa076e4bb76 100644 --- a/tests/quantization/eetq_integration/test_eetq.py +++ b/tests/quantization/eetq_integration/test_eetq.py @@ -119,7 +119,7 @@ def test_quantized_model_conversion(self): self.assertEqual(nb_linears - 1, nb_eetq_linear) - # Try with `linear_weights_not_to_quantize` + # Try with `modules_to_not_convert` with init_empty_weights(): model = OPTForCausalLM(config) quantization_config = EetqConfig(modules_to_not_convert=["fc1"]) @@ -128,7 +128,7 @@ def test_quantized_model_conversion(self): for module in model.modules(): if isinstance(module, EetqLinear): nb_eetq_linear += 1 - + # 25 corresponds to the lm_head along with 24 fc1 layers. 
self.assertEqual(nb_linears - 25, nb_eetq_linear) def test_quantized_model(self): From 8e806a336f53af4603fcc6868e6abd9058fbb8fd Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Mon, 9 Dec 2024 16:09:50 +0100 Subject: [PATCH 13/62] Cleanup: continue the init refactor (#35167) Round 2 --- .../audio_spectrogram_transformer/__init__.py | 48 +---- ...iguration_audio_spectrogram_transformer.py | 3 + ...xtraction_audio_spectrogram_transformer.py | 3 + .../modeling_audio_spectrogram_transformer.py | 3 + src/transformers/models/bark/__init__.py | 65 +----- .../models/bark/configuration_bark.py | 3 + src/transformers/models/bark/modeling_bark.py | 10 + .../models/bark/processing_bark.py | 3 + src/transformers/models/bart/__init__.py | 138 ++----------- .../models/bart/configuration_bart.py | 3 + src/transformers/models/bart/modeling_bart.py | 12 ++ .../models/bart/modeling_flax_bart.py | 11 + .../models/bart/modeling_tf_bart.py | 3 + .../models/bart/tokenization_bart.py | 3 + .../models/bart/tokenization_bart_fast.py | 3 + src/transformers/models/barthez/__init__.py | 46 +---- .../models/barthez/tokenization_barthez.py | 3 + .../barthez/tokenization_barthez_fast.py | 3 + src/transformers/models/bartpho/__init__.py | 28 +-- .../models/bartpho/tokenization_bartpho.py | 3 + src/transformers/models/beit/__init__.py | 101 +--------- .../models/beit/configuration_beit.py | 3 + .../models/beit/feature_extraction_beit.py | 3 + .../models/beit/image_processing_beit.py | 3 + src/transformers/models/beit/modeling_beit.py | 10 + .../models/beit/modeling_flax_beit.py | 8 + src/transformers/models/bert/__init__.py | 189 ++---------------- .../models/bert/configuration_bert.py | 3 + src/transformers/models/bert/modeling_bert.py | 16 ++ .../models/bert/modeling_flax_bert.py | 14 ++ .../models/bert/modeling_tf_bert.py | 16 ++ .../models/bert/tokenization_bert.py | 3 + .../models/bert/tokenization_bert_fast.py | 3 + .../models/bert/tokenization_bert_tf.py | 3 + .../models/bert_generation/__init__.py | 59 +----- .../configuration_bert_generation.py | 3 + .../modeling_bert_generation.py | 8 + .../tokenization_bert_generation.py | 3 + .../models/bert_japanese/__init__.py | 13 +- .../tokenization_bert_japanese.py | 3 + src/transformers/models/bertweet/__init__.py | 13 +- .../models/bertweet/tokenization_bertweet.py | 3 + 42 files changed, 263 insertions(+), 611 deletions(-) diff --git a/src/transformers/models/audio_spectrogram_transformer/__init__.py b/src/transformers/models/audio_spectrogram_transformer/__init__.py index 9f1d65e1aac839..3fe10d60c03a92 100644 --- a/src/transformers/models/audio_spectrogram_transformer/__init__.py +++ b/src/transformers/models/audio_spectrogram_transformer/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,47 +13,17 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available - - -_import_structure = { - "configuration_audio_spectrogram_transformer": ["ASTConfig"], - "feature_extraction_audio_spectrogram_transformer": ["ASTFeatureExtractor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_audio_spectrogram_transformer"] = [ - "ASTForAudioClassification", - "ASTModel", - "ASTPreTrainedModel", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_audio_spectrogram_transformer import ( - ASTConfig, - ) - from .feature_extraction_audio_spectrogram_transformer import ASTFeatureExtractor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_audio_spectrogram_transformer import ( - ASTForAudioClassification, - ASTModel, - ASTPreTrainedModel, - ) - - + from .configuration_audio_spectrogram_transformer import * + from .convert_audio_spectrogram_transformer_original_to_pytorch import * + from .feature_extraction_audio_spectrogram_transformer import * + from .modeling_audio_spectrogram_transformer import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py index 7980667a68d7c5..77bec930236f60 100644 --- a/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -126,3 +126,6 @@ def __init__( # generative parameters deprecation cycle, overwriting this function prevents this from happening. 
def _get_non_default_generation_parameters(self) -> Dict[str, Any]: return {} + + +__all__ = ["ASTConfig"] diff --git a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py index 2bd122b4098c36..b181afe19e9ef8 100644 --- a/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -234,3 +234,6 @@ def __call__( padded_inputs = padded_inputs.convert_to_tensors(return_tensors) return padded_inputs + + +__all__ = ["ASTFeatureExtractor"] diff --git a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py index 491c6ce164611a..a9fe0d75f5c380 100644 --- a/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +++ b/src/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -670,3 +670,6 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = ["ASTForAudioClassification", "ASTModel", "ASTPreTrainedModel"] diff --git a/src/transformers/models/bark/__init__.py b/src/transformers/models/bark/__init__.py index 4cb1a606cf6567..6c21cf99976a15 100644 --- a/src/transformers/models/bark/__init__.py +++ b/src/transformers/models/bark/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2023 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,63 +13,18 @@ # limitations under the License. 
from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_bark": [ - "BarkCoarseConfig", - "BarkConfig", - "BarkFineConfig", - "BarkSemanticConfig", - ], - "processing_bark": ["BarkProcessor"], -} - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bark"] = [ - "BarkFineModel", - "BarkSemanticModel", - "BarkCoarseModel", - "BarkModel", - "BarkPreTrainedModel", - "BarkCausalModel", - ] - if TYPE_CHECKING: - from .configuration_bark import ( - BarkCoarseConfig, - BarkConfig, - BarkFineConfig, - BarkSemanticConfig, - ) - from .processing_bark import BarkProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bark import ( - BarkCausalModel, - BarkCoarseModel, - BarkFineModel, - BarkModel, - BarkPreTrainedModel, - BarkSemanticModel, - ) - + from .configuration_bark import * + from .convert_suno_to_hf import * + from .generation_configuration_bark import * + from .modeling_bark import * + from .processing_bark import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index a498d1dd19371d..932bad618aa187 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -298,3 +298,6 @@ def from_sub_model_configs( codec_config=codec_config.to_dict(), **kwargs, ) + + +__all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"] diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index f1c77367e5beb7..9e225ac9ae15c0 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -1819,3 +1819,13 @@ def _check_and_enable_flash_attn_2( config.coarse_acoustics_config._attn_implementation = config._attn_implementation config.fine_acoustics_config._attn_implementation = config._attn_implementation return config + + +__all__ = [ + "BarkFineModel", + "BarkSemanticModel", + "BarkCoarseModel", + "BarkModel", + "BarkPreTrainedModel", + "BarkCausalModel", +] diff --git a/src/transformers/models/bark/processing_bark.py b/src/transformers/models/bark/processing_bark.py index 53715f3260422c..0bed6ca79f410b 100644 --- a/src/transformers/models/bark/processing_bark.py +++ b/src/transformers/models/bark/processing_bark.py @@ -285,3 +285,6 @@ def __call__( encoded_text["history_prompt"] = voice_preset return encoded_text + + +__all__ = ["BarkProcessor"] diff --git a/src/transformers/models/bart/__init__.py b/src/transformers/models/bart/__init__.py index d538fbb7d34304..11c3f4863f46a1 100644 --- a/src/transformers/models/bart/__init__.py +++ b/src/transformers/models/bart/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. 
# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,134 +13,20 @@ # limitations under the License. from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = { - "configuration_bart": ["BartConfig", "BartOnnxConfig"], - "tokenization_bart": ["BartTokenizer"], -} - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bart_fast"] = ["BartTokenizerFast"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bart"] = [ - "BartForCausalLM", - "BartForConditionalGeneration", - "BartForQuestionAnswering", - "BartForSequenceClassification", - "BartModel", - "BartPreTrainedModel", - "BartPretrainedModel", - "PretrainedBartModel", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_bart"] = [ - "TFBartForConditionalGeneration", - "TFBartForSequenceClassification", - "TFBartModel", - "TFBartPretrainedModel", - ] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_bart"] = [ - "FlaxBartDecoderPreTrainedModel", - "FlaxBartForCausalLM", - "FlaxBartForConditionalGeneration", - "FlaxBartForQuestionAnswering", - "FlaxBartForSequenceClassification", - "FlaxBartModel", - "FlaxBartPreTrainedModel", - ] - if TYPE_CHECKING: - from .configuration_bart import BartConfig, BartOnnxConfig - from .tokenization_bart import BartTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bart_fast import BartTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bart import ( - BartForCausalLM, - BartForConditionalGeneration, - BartForQuestionAnswering, - BartForSequenceClassification, - BartModel, - BartPreTrainedModel, - BartPretrainedModel, - PretrainedBartModel, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_bart import ( - TFBartForConditionalGeneration, - TFBartForSequenceClassification, - TFBartModel, - TFBartPretrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_bart import ( - FlaxBartDecoderPreTrainedModel, - FlaxBartForCausalLM, - FlaxBartForConditionalGeneration, - FlaxBartForQuestionAnswering, - FlaxBartForSequenceClassification, - FlaxBartModel, - FlaxBartPreTrainedModel, - ) - + from .configuration_bart import * + from .convert_bart_original_pytorch_checkpoint_to_pytorch import * + from .modeling_bart import * + from .modeling_flax_bart import * + from .modeling_tf_bart import * + from .tokenization_bart import * + from .tokenization_bart_fast import * else: import 
sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bart/configuration_bart.py b/src/transformers/models/bart/configuration_bart.py index a3bc7f38653a8a..4ce4316e3c0315 100644 --- a/src/transformers/models/bart/configuration_bart.py +++ b/src/transformers/models/bart/configuration_bart.py @@ -400,3 +400,6 @@ def _flatten_past_key_values_(self, flattened_output, name, idx, t): flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( flattened_output, name, idx, t ) + + +__all__ = ["BartConfig", "BartOnnxConfig"] diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py index 07c1fa622ea3b6..dd1b69c8127fb8 100755 --- a/src/transformers/models/bart/modeling_bart.py +++ b/src/transformers/models/bart/modeling_bart.py @@ -2158,3 +2158,15 @@ def _reorder_cache(past_key_values, beam_idx): tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), ) return reordered_past + + +__all__ = [ + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPreTrainedModel", + "BartPretrainedModel", + "PretrainedBartModel", +] diff --git a/src/transformers/models/bart/modeling_flax_bart.py b/src/transformers/models/bart/modeling_flax_bart.py index 634c256fe7d81d..b346eaa39fc199 100644 --- a/src/transformers/models/bart/modeling_flax_bart.py +++ b/src/transformers/models/bart/modeling_flax_bart.py @@ -1993,3 +1993,14 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC, ) + + +__all__ = [ + "FlaxBartDecoderPreTrainedModel", + "FlaxBartForCausalLM", + "FlaxBartForConditionalGeneration", + "FlaxBartForQuestionAnswering", + "FlaxBartForSequenceClassification", + "FlaxBartModel", + "FlaxBartPreTrainedModel", +] diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py index 5ebde8cba60c45..7ab9817986e6ad 100644 --- a/src/transformers/models/bart/modeling_tf_bart.py +++ b/src/transformers/models/bart/modeling_tf_bart.py @@ -1709,3 +1709,6 @@ def build(self, input_shape=None): if getattr(self, "classification_head", None) is not None: with tf.name_scope(self.classification_head.name): self.classification_head.build(None) + + +__all__ = ["TFBartForConditionalGeneration", "TFBartForSequenceClassification", "TFBartModel", "TFBartPretrainedModel"] diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 5207b9c92b07ff..4c516cb81be0d2 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -388,3 +388,6 @@ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): text = " " + text return (text, kwargs) + + +__all__ = ["BartTokenizer"] diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index e9fb8497c907b9..4586ab4797e5ec 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ 
-274,3 +274,6 @@ def create_token_type_ids_from_sequences( if token_ids_1 is None: return len(cls + token_ids_0 + sep) * [0] return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["BartTokenizerFast"] diff --git a/src/transformers/models/barthez/__init__.py b/src/transformers/models/barthez/__init__.py index 084cd22bdf1d88..323fe2fe8af982 100644 --- a/src/transformers/models/barthez/__init__.py +++ b/src/transformers/models/barthez/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,49 +11,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_tokenizers_available - - -_import_structure = {} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_barthez"] = ["BarthezTokenizer"] - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_barthez_fast"] = ["BarthezTokenizerFast"] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_barthez import BarthezTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_barthez_fast import BarthezTokenizerFast - + from .tokenization_barthez import * + from .tokenization_barthez_fast import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 46decddb3e10ba..604f9c7c21519a 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -284,3 +284,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["BarthezTokenizer"] diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index df8cc7757e96c0..a1d95ef03e4882 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -192,3 +192,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = copyfile(self.vocab_file, out_vocab_file) return (out_vocab_file,) + + +__all__ = ["BarthezTokenizerFast"] diff --git a/src/transformers/models/bartpho/__init__.py b/src/transformers/models/bartpho/__init__.py index 
c20d7370c6566c..597be95d8175ca 100644 --- a/src/transformers/models/bartpho/__init__.py +++ b/src/transformers/models/bartpho/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,32 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -_import_structure = {} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bartpho"] = ["BartphoTokenizer"] if TYPE_CHECKING: - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bartpho import BartphoTokenizer - + from .tokenization_bartpho import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bartpho/tokenization_bartpho.py b/src/transformers/models/bartpho/tokenization_bartpho.py index df121f26e255f4..e6e4f889842e8f 100644 --- a/src/transformers/models/bartpho/tokenization_bartpho.py +++ b/src/transformers/models/bartpho/tokenization_bartpho.py @@ -311,3 +311,6 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = fp.write(f"{str(token)} \n") return out_vocab_file, out_monolingual_vocab_file + + +__all__ = ["BartphoTokenizer"] diff --git a/src/transformers/models/beit/__init__.py b/src/transformers/models/beit/__init__.py index c2f49240d6e64c..0fc8919c7ea19a 100644 --- a/src/transformers/models/beit/__init__.py +++ b/src/transformers/models/beit/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2021 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,100 +11,21 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_torch_available, - is_vision_available, -) - - -_import_structure = {"configuration_beit": ["BeitConfig", "BeitOnnxConfig"]} - -try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["feature_extraction_beit"] = ["BeitFeatureExtractor"] - _import_structure["image_processing_beit"] = ["BeitImageProcessor"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_beit"] = [ - "BeitForImageClassification", - "BeitForMaskedImageModeling", - "BeitForSemanticSegmentation", - "BeitModel", - "BeitPreTrainedModel", - "BeitBackbone", - ] - +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_beit"] = [ - "FlaxBeitForImageClassification", - "FlaxBeitForMaskedImageModeling", - "FlaxBeitModel", - "FlaxBeitPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_beit import BeitConfig, BeitOnnxConfig - - try: - if not is_vision_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .feature_extraction_beit import BeitFeatureExtractor - from .image_processing_beit import BeitImageProcessor - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_beit import ( - BeitBackbone, - BeitForImageClassification, - BeitForMaskedImageModeling, - BeitForSemanticSegmentation, - BeitModel, - BeitPreTrainedModel, - ) - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_beit import ( - FlaxBeitForImageClassification, - FlaxBeitForMaskedImageModeling, - FlaxBeitModel, - FlaxBeitPreTrainedModel, - ) - - + from .configuration_beit import * + from .convert_beit_unilm_to_pytorch import * + from .feature_extraction_beit import * + from .image_processing_beit import * + from .modeling_beit import * + from .modeling_flax_beit import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py index f0f3c2582c35cc..834988258c6b75 100644 --- a/src/transformers/models/beit/configuration_beit.py +++ b/src/transformers/models/beit/configuration_beit.py @@ -224,3 +224,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: @property def atol_for_validation(self) -> float: return 1e-4 + + +__all__ = ["BeitConfig", "BeitOnnxConfig"] diff --git a/src/transformers/models/beit/feature_extraction_beit.py b/src/transformers/models/beit/feature_extraction_beit.py index 59dacb4ae51f6e..141d8bc36d2bbb 100644 --- a/src/transformers/models/beit/feature_extraction_beit.py +++ b/src/transformers/models/beit/feature_extraction_beit.py @@ -31,3 +31,6 @@ def __init__(self, *args, **kwargs) -> None: FutureWarning, ) 
super().__init__(*args, **kwargs) + + +__all__ = ["BeitFeatureExtractor"] diff --git a/src/transformers/models/beit/image_processing_beit.py b/src/transformers/models/beit/image_processing_beit.py index 7398381b2229bf..af76dd2e9656cb 100644 --- a/src/transformers/models/beit/image_processing_beit.py +++ b/src/transformers/models/beit/image_processing_beit.py @@ -510,3 +510,6 @@ def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] return semantic_segmentation + + +__all__ = ["BeitImageProcessor"] diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py index f972e021f3e2b3..01c16ca2cf000b 100755 --- a/src/transformers/models/beit/modeling_beit.py +++ b/src/transformers/models/beit/modeling_beit.py @@ -1576,3 +1576,13 @@ def forward( hidden_states=outputs.hidden_states if output_hidden_states else None, attentions=outputs.attentions, ) + + +__all__ = [ + "BeitForImageClassification", + "BeitForMaskedImageModeling", + "BeitForSemanticSegmentation", + "BeitModel", + "BeitPreTrainedModel", + "BeitBackbone", +] diff --git a/src/transformers/models/beit/modeling_flax_beit.py b/src/transformers/models/beit/modeling_flax_beit.py index c1da64d263a266..2d79c1820088a1 100644 --- a/src/transformers/models/beit/modeling_flax_beit.py +++ b/src/transformers/models/beit/modeling_flax_beit.py @@ -946,3 +946,11 @@ class FlaxBeitForImageClassification(FlaxBeitPreTrainedModel): append_replace_return_docstrings( FlaxBeitForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=BeitConfig ) + + +__all__ = [ + "FlaxBeitForImageClassification", + "FlaxBeitForMaskedImageModeling", + "FlaxBeitModel", + "FlaxBeitPreTrainedModel", +] diff --git a/src/transformers/models/bert/__init__.py b/src/transformers/models/bert/__init__.py index 17048a5d1c967a..3ed12a889321e6 100644 --- a/src/transformers/models/bert/__init__.py +++ b/src/transformers/models/bert/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,183 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING -from ...utils import ( - OptionalDependencyNotAvailable, - _LazyModule, - is_flax_available, - is_tensorflow_text_available, - is_tf_available, - is_tokenizers_available, - is_torch_available, -) - - -_import_structure = { - "configuration_bert": ["BertConfig", "BertOnnxConfig"], - "tokenization_bert": ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"], -} - -try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_fast"] = ["BertTokenizerFast"] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bert"] = [ - "BertForMaskedLM", - "BertForMultipleChoice", - "BertForNextSentencePrediction", - "BertForPreTraining", - "BertForQuestionAnswering", - "BertForSequenceClassification", - "BertForTokenClassification", - "BertLayer", - "BertLMHeadModel", - "BertModel", - "BertPreTrainedModel", - "load_tf_weights_in_bert", - ] - -try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_tf_bert"] = [ - "TFBertEmbeddings", - "TFBertForMaskedLM", - "TFBertForMultipleChoice", - "TFBertForNextSentencePrediction", - "TFBertForPreTraining", - "TFBertForQuestionAnswering", - "TFBertForSequenceClassification", - "TFBertForTokenClassification", - "TFBertLMHeadModel", - "TFBertMainLayer", - "TFBertModel", - "TFBertPreTrainedModel", - ] -try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_tf"] = ["TFBertTokenizer"] - -try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_flax_bert"] = [ - "FlaxBertForCausalLM", - "FlaxBertForMaskedLM", - "FlaxBertForMultipleChoice", - "FlaxBertForNextSentencePrediction", - "FlaxBertForPreTraining", - "FlaxBertForQuestionAnswering", - "FlaxBertForSequenceClassification", - "FlaxBertForTokenClassification", - "FlaxBertModel", - "FlaxBertPreTrainedModel", - ] if TYPE_CHECKING: - from .configuration_bert import BertConfig, BertOnnxConfig - from .tokenization_bert import BasicTokenizer, BertTokenizer, WordpieceTokenizer - - try: - if not is_tokenizers_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_fast import BertTokenizerFast - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bert import ( - BertForMaskedLM, - BertForMultipleChoice, - BertForNextSentencePrediction, - BertForPreTraining, - BertForQuestionAnswering, - BertForSequenceClassification, - BertForTokenClassification, - BertLayer, - BertLMHeadModel, - BertModel, - BertPreTrainedModel, - load_tf_weights_in_bert, - ) - - try: - if not is_tf_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_tf_bert import ( - TFBertEmbeddings, - TFBertForMaskedLM, - TFBertForMultipleChoice, - TFBertForNextSentencePrediction, - TFBertForPreTraining, - TFBertForQuestionAnswering, - 
TFBertForSequenceClassification, - TFBertForTokenClassification, - TFBertLMHeadModel, - TFBertMainLayer, - TFBertModel, - TFBertPreTrainedModel, - ) - - try: - if not is_tensorflow_text_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_tf import TFBertTokenizer - - try: - if not is_flax_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_flax_bert import ( - FlaxBertForCausalLM, - FlaxBertForMaskedLM, - FlaxBertForMultipleChoice, - FlaxBertForNextSentencePrediction, - FlaxBertForPreTraining, - FlaxBertForQuestionAnswering, - FlaxBertForSequenceClassification, - FlaxBertForTokenClassification, - FlaxBertModel, - FlaxBertPreTrainedModel, - ) - + from .configuration_bert import * + from .convert_bert_original_tf2_checkpoint_to_pytorch import * + from .convert_bert_original_tf_checkpoint_to_pytorch import * + from .convert_bert_pytorch_checkpoint_to_original_tf import * + from .convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch import * + from .modeling_bert import * + from .modeling_flax_bert import * + from .modeling_tf_bert import * + from .tokenization_bert import * + from .tokenization_bert_fast import * + from .tokenization_bert_tf import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert/configuration_bert.py b/src/transformers/models/bert/configuration_bert.py index 613cf6a11463c2..ea29fb81c435aa 100644 --- a/src/transformers/models/bert/configuration_bert.py +++ b/src/transformers/models/bert/configuration_bert.py @@ -149,3 +149,6 @@ def inputs(self) -> Mapping[str, Mapping[int, str]]: ("token_type_ids", dynamic_axis), ] ) + + +__all__ = ["BertConfig", "BertOnnxConfig"] diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index e311f93b6c81ed..0c53963cee7922 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -1991,3 +1991,19 @@ def forward( hidden_states=outputs.hidden_states, attentions=outputs.attentions, ) + + +__all__ = [ + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + "load_tf_weights_in_bert", +] diff --git a/src/transformers/models/bert/modeling_flax_bert.py b/src/transformers/models/bert/modeling_flax_bert.py index 772ea2bf12b2ee..83358c86bd280d 100644 --- a/src/transformers/models/bert/modeling_flax_bert.py +++ b/src/transformers/models/bert/modeling_flax_bert.py @@ -1711,3 +1711,17 @@ def update_inputs_for_generation(self, model_outputs, model_kwargs): FlaxCausalLMOutputWithCrossAttentions, _CONFIG_FOR_DOC, ) + + +__all__ = [ + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBertPreTrainedModel", +] diff --git a/src/transformers/models/bert/modeling_tf_bert.py 
b/src/transformers/models/bert/modeling_tf_bert.py index bb3281278adaa1..ce862194dc7787 100644 --- a/src/transformers/models/bert/modeling_tf_bert.py +++ b/src/transformers/models/bert/modeling_tf_bert.py @@ -2108,3 +2108,19 @@ def build(self, input_shape=None): if getattr(self, "qa_outputs", None) is not None: with tf.name_scope(self.qa_outputs.name): self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFBertEmbeddings", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertLMHeadModel", + "TFBertMainLayer", + "TFBertModel", + "TFBertPreTrainedModel", +] diff --git a/src/transformers/models/bert/tokenization_bert.py b/src/transformers/models/bert/tokenization_bert.py index 07583b949661de..42d4dd94554d41 100644 --- a/src/transformers/models/bert/tokenization_bert.py +++ b/src/transformers/models/bert/tokenization_bert.py @@ -502,3 +502,6 @@ def tokenize(self, text): else: output_tokens.extend(sub_tokens) return output_tokens + + +__all__ = ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"] diff --git a/src/transformers/models/bert/tokenization_bert_fast.py b/src/transformers/models/bert/tokenization_bert_fast.py index f4897772847029..4a89e6053b988f 100644 --- a/src/transformers/models/bert/tokenization_bert_fast.py +++ b/src/transformers/models/bert/tokenization_bert_fast.py @@ -170,3 +170,6 @@ def create_token_type_ids_from_sequences( def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: files = self._tokenizer.model.save(save_directory, name=filename_prefix) return tuple(files) + + +__all__ = ["BertTokenizerFast"] diff --git a/src/transformers/models/bert/tokenization_bert_tf.py b/src/transformers/models/bert/tokenization_bert_tf.py index ebf88eeac9bbe8..b1f49722fbdffa 100644 --- a/src/transformers/models/bert/tokenization_bert_tf.py +++ b/src/transformers/models/bert/tokenization_bert_tf.py @@ -252,3 +252,6 @@ def get_config(self): "sep_token_id": self.sep_token_id, "pad_token_id": self.pad_token_id, } + + +__all__ = ["TFBertTokenizer"] diff --git a/src/transformers/models/bert_generation/__init__.py b/src/transformers/models/bert_generation/__init__.py index 14cf8bb5879320..3f83b1f6e5bba3 100644 --- a/src/transformers/models/bert_generation/__init__.py +++ b/src/transformers/models/bert_generation/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,61 +11,18 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING -from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_sentencepiece_available, is_torch_available - - -_import_structure = {"configuration_bert_generation": ["BertGenerationConfig"]} - -try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["tokenization_bert_generation"] = ["BertGenerationTokenizer"] - -try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() -except OptionalDependencyNotAvailable: - pass -else: - _import_structure["modeling_bert_generation"] = [ - "BertGenerationDecoder", - "BertGenerationEncoder", - "BertGenerationPreTrainedModel", - "load_tf_weights_in_bert_generation", - ] +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .configuration_bert_generation import BertGenerationConfig - - try: - if not is_sentencepiece_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .tokenization_bert_generation import BertGenerationTokenizer - - try: - if not is_torch_available(): - raise OptionalDependencyNotAvailable() - except OptionalDependencyNotAvailable: - pass - else: - from .modeling_bert_generation import ( - BertGenerationDecoder, - BertGenerationEncoder, - BertGenerationPreTrainedModel, - load_tf_weights_in_bert_generation, - ) - + from .configuration_bert_generation import * + from .modeling_bert_generation import * + from .tokenization_bert_generation import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert_generation/configuration_bert_generation.py b/src/transformers/models/bert_generation/configuration_bert_generation.py index d1d1b51b6538e2..1abe7c1a1c44ab 100644 --- a/src/transformers/models/bert_generation/configuration_bert_generation.py +++ b/src/transformers/models/bert_generation/configuration_bert_generation.py @@ -122,3 +122,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.position_embedding_type = position_embedding_type self.use_cache = use_cache + + +__all__ = ["BertGenerationConfig"] diff --git a/src/transformers/models/bert_generation/modeling_bert_generation.py b/src/transformers/models/bert_generation/modeling_bert_generation.py index 800ea2bef1d631..aaf326aa2de8eb 100755 --- a/src/transformers/models/bert_generation/modeling_bert_generation.py +++ b/src/transformers/models/bert_generation/modeling_bert_generation.py @@ -996,3 +996,11 @@ def _reorder_cache(self, past_key_values, beam_idx): tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), ) return reordered_past + + +__all__ = [ + "BertGenerationDecoder", + "BertGenerationEncoder", + "BertGenerationPreTrainedModel", + "load_tf_weights_in_bert_generation", +] diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index b1adb9b62b2551..31f046863c289c 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -170,3 +170,6 @@ def save_vocabulary(self, save_directory: str, 
filename_prefix: Optional[str] = fi.write(content_spiece_model) return (out_vocab_file,) + + +__all__ = ["BertGenerationTokenizer"] diff --git a/src/transformers/models/bert_japanese/__init__.py b/src/transformers/models/bert_japanese/__init__.py index a569c3cc54bff8..f5296087db1d00 100644 --- a/src/transformers/models/bert_japanese/__init__.py +++ b/src/transformers/models/bert_japanese/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from typing import TYPE_CHECKING from ...utils import _LazyModule - - -_import_structure = {"tokenization_bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"]} +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .tokenization_bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer - + from .tokenization_bert_japanese import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py index 732e9e7aff5741..8a841a3091623d 100644 --- a/src/transformers/models/bert_japanese/tokenization_bert_japanese.py +++ b/src/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -977,3 +977,6 @@ def tokenize(self, text): new_pieces.append(piece) return new_pieces + + +__all__ = ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"] diff --git a/src/transformers/models/bertweet/__init__.py b/src/transformers/models/bertweet/__init__.py index 42e4a23337c20c..432622f1595d1a 100644 --- a/src/transformers/models/bertweet/__init__.py +++ b/src/transformers/models/bertweet/__init__.py @@ -1,4 +1,4 @@ -# Copyright 2020 The HuggingFace Team. All rights reserved. +# Copyright 2024 The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,19 +11,16 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from typing import TYPE_CHECKING from ...utils import _LazyModule - - -_import_structure = {"tokenization_bertweet": ["BertweetTokenizer"]} +from ...utils.import_utils import define_import_structure if TYPE_CHECKING: - from .tokenization_bertweet import BertweetTokenizer - + from .tokenization_bertweet import * else: import sys - sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/src/transformers/models/bertweet/tokenization_bertweet.py b/src/transformers/models/bertweet/tokenization_bertweet.py index f478dd0832b6e4..499238e5955fe0 100644 --- a/src/transformers/models/bertweet/tokenization_bertweet.py +++ b/src/transformers/models/bertweet/tokenization_bertweet.py @@ -764,3 +764,6 @@ def casual_tokenize(text, preserve_case=True, reduce_len=False, strip_handles=Fa ############################################################################### + + +__all__ = ["BertweetTokenizer"] From 4bc39de5c34d4ffed42af4ddbf7bce0ed8a8bba6 Mon Sep 17 00:00:00 2001 From: fzyzcjy <5236035+fzyzcjy@users.noreply.github.com> Date: Tue, 10 Dec 2024 00:31:32 +0800 Subject: [PATCH 14/62] Super tiny fix logging message (#35132) Update integration_utils.py --- src/transformers/integrations/integration_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 0cc2685a55206f..e1f3bccb842fd1 100755 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -1235,7 +1235,7 @@ def setup(self, args, state, model): logger.debug( f"MLflow experiment_name={self._experiment_name}, run_name={args.run_name}, nested={self._nested_run}," - f" tags={self._nested_run}, tracking_uri={self._tracking_uri}" + f" tracking_uri={self._tracking_uri}" ) if state.is_world_process_zero: if not self._ml_flow.is_tracking_uri_set(): From fa8763ce172fff0ae38280bf8cd7428c4921e0ed Mon Sep 17 00:00:00 2001 From: UV Date: Mon, 9 Dec 2024 22:10:32 +0530 Subject: [PATCH 15/62] Fixed typo of 'avilable' in prompts.py (#35145) --- src/transformers/agents/prompts.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/agents/prompts.py b/src/transformers/agents/prompts.py index 898a7e011a2b05..0cf8beb144f8ba 100644 --- a/src/transformers/agents/prompts.py +++ b/src/transformers/agents/prompts.py @@ -395,7 +395,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): SYSTEM_PROMPT_PLAN = """You are a world expert at making efficient plans to solve any task using a set of carefully crafted tools. Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. 
After writing the final step of the plan, write the '\n' tag and stop there.""" @@ -466,7 +466,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): ``` Now for the given task, develop a step-by-step high-level plan taking into account the above inputs and list of facts. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. Beware that you have {remaining_steps} steps remaining. Do not skip steps, do not add any superfluous steps. Only write the high-level plan, DO NOT DETAIL INDIVIDUAL TOOL CALLS. After writing the final step of the plan, write the '\n' tag and stop there. @@ -474,7 +474,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): Now write your new plan below.""" SYSTEM_PROMPT_PLAN_STRUCTURED = """Output a step-by-step plan to solve the task using the given tools. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. Each step should be structured as follows: +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. Each step should be structured as follows: Step #n: { "description": "tool": , @@ -620,7 +620,7 @@ def download_prompt(prompt_or_repo_id, agent_name, mode="run"): After writing the final step of the plan, write the '\n' tag and stop there. Output the plan only and nothing else.""" SYSTEM_PROMPT_PLAN_UPDATE_STRUCTURED = """Output a step-by-step plan to solve the task using the given tools. -This plan should involve individual tasks based on the avilable tools, that if executed correctly will yield the correct answer. Each step should be structured as follows: +This plan should involve individual tasks based on the available tools, that if executed correctly will yield the correct answer. 
Each step should be structured as follows: Step #n: {{ "description": "tool": , From 34f4080ff59b1668d919a1ba9f8bc4a3a2a3f478 Mon Sep 17 00:00:00 2001 From: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Date: Mon, 9 Dec 2024 13:55:16 -0500 Subject: [PATCH 16/62] [CI] Fix bnb quantization tests with accelerate>=1.2.0 (#35172) --- tests/quantization/bnb/test_4bit.py | 8 ++++---- tests/quantization/bnb/test_mixed_int8.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/quantization/bnb/test_4bit.py b/tests/quantization/bnb/test_4bit.py index 3eae429abb206a..9512d0aa70af97 100644 --- a/tests/quantization/bnb/test_4bit.py +++ b/tests/quantization/bnb/test_4bit.py @@ -385,14 +385,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_4bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -410,14 +410,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear4bit)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_4bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index 567aa956271b70..158fdfaf71dc5c 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -514,14 +514,14 @@ def test_inference_without_keep_in_fp32(self): # test with `google-t5/t5-small` model = T5ForConditionalGeneration.from_pretrained(self.model_name, load_in_8bit=True, device_map="auto") - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) T5ForConditionalGeneration._keep_in_fp32_modules = modules @@ -540,14 +540,14 @@ def test_inference_with_keep_in_fp32(self): # there was a bug with decoders - this test 
checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) def test_inference_with_keep_in_fp32_serialized(self): @@ -571,14 +571,14 @@ def test_inference_with_keep_in_fp32_serialized(self): # there was a bug with decoders - this test checks that it is fixed self.assertTrue(isinstance(model.decoder.block[0].layer[0].SelfAttention.q, bnb.nn.Linear8bitLt)) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) # test with `flan-t5-small` model = T5ForConditionalGeneration.from_pretrained( self.dense_act_model_name, load_in_8bit=True, device_map="auto" ) - encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(torch_device) + encoded_input = self.tokenizer(self.input_text, return_tensors="pt").to(model.device) _ = model.generate(**encoded_input) From dada0fd85f13c68dd7506aba0740083523bcf2df Mon Sep 17 00:00:00 2001 From: Spiros Dontas Date: Tue, 10 Dec 2024 09:40:40 +0200 Subject: [PATCH 17/62] Fix `num_items_in_batch` not being an integer (#35115) In method `Trainer#get_batch_samples`, the return values should be a list of batch samples and an integer indicating the number of items that exist in the batch. However, this was not actually a case and what was returned instead of an integer, was a tensor with one element. In the multi-GPU setup, this tensor is placed in a different device than the loss tensor, causing the loss function to raise a `RuntimeError`. The problem arises from https://github.com/huggingface/transformers/blob/5d7739f15a6e50de416977fe2cc9cb516d67edda/src/transformers/trainer.py#L5139-L5144, where the outer `sum` operates over a list of tensors which means that the final result is also a tensor. To counter this issue, a new check (after the accelerator gathering) has been added in order to convert a potential tensor to an integer before returning the `num_items_in_batch`. 
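For illustration, a minimal sketch of the failure mode and of the added guard (the values and the standalone snippet are hypothetical; the actual change lives in `Trainer#get_batch_samples`, shown in the diff below):

    import torch

    # Summing a list that contains tensors yields a tensor, not a Python int:
    per_batch_counts = [torch.tensor(8), torch.tensor(5)]
    num_items_in_batch = sum(per_batch_counts)  # -> tensor(13), living on some device

    # In a multi-GPU run this tensor can sit on a different device than the loss,
    # so the added check converts it back to a plain integer before it is returned:
    if torch.is_tensor(num_items_in_batch):
        num_items_in_batch = num_items_in_batch.item()  # -> 13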
--- src/transformers/trainer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f7d79481809807..be41a415e5a710 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -5138,4 +5138,8 @@ def get_batch_samples(self, epoch_iterator, num_batches): if self.args.average_tokens_across_devices: num_items_in_batch = self.accelerator.gather(num_items_in_batch).sum().item() + + if torch.is_tensor(num_items_in_batch): + num_items_in_batch = num_items_in_batch.item() + return batch_samples, num_items_in_batch From 0938b57770db818b3d818bc00fa989fa206087dd Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 10 Dec 2024 09:59:17 +0100 Subject: [PATCH 18/62] Assisted decoding multi-gpu (#35116) * fix * move a few lines up --- src/transformers/generation/utils.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 89c57cb913fec2..fe634141eca09b 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -4260,9 +4260,10 @@ def _assisted_decoding( while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device): cur_len = input_ids.shape[-1] - # 1. Fetch candidate sequences from a `CandidateGenerator` + # 1. Fetch candidate sequences from a `CandidateGenerator` and move to the correct device candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids) + candidate_input_ids = candidate_input_ids.to(self.device) if candidate_logits is not None: candidate_logits = candidate_logits.to(self.device) From 80f2b1610fa17ebf582897c8611180cac38652f0 Mon Sep 17 00:00:00 2001 From: Trevor Royer Date: Tue, 10 Dec 2024 02:11:45 -0700 Subject: [PATCH 19/62] Fix file path for shard_num 1 with mllama converter (#35053) "#35049 fix path for num_shard 1" --- .../models/mllama/convert_mllama_weights_to_hf.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py index ca22d31ee3ca5e..b2c40e27bb2b40 100644 --- a/src/transformers/models/mllama/convert_mllama_weights_to_hf.py +++ b/src/transformers/models/mllama/convert_mllama_weights_to_hf.py @@ -338,7 +338,11 @@ def write_model( print(f"Fetching all parameters from the checkpoint at {input_base_path}...") if num_shards == 1: - loaded = [torch.load(os.path.join(input_base_path, "consolidated.pth"), map_location="cpu", mmap=True)] + if os.path.exists(os.path.join(input_base_path, "consolidated.00.pth")): + path = os.path.join(input_base_path, "consolidated.00.pth") + else: + path = os.path.join(input_base_path, "consolidated.pth") + loaded = [torch.load(path, map_location="cpu", mmap=True)] else: loaded = [ torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pth"), map_location="cpu", mmap=True) From 6acb4e43a7a0c89830bc3658cb425477e6934be5 Mon Sep 17 00:00:00 2001 From: Gallil Maimon <62688880+gallilmaimon@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:18:23 +0200 Subject: [PATCH 20/62] Support BatchNorm in Hubert pos_conv_emb as in fairseq (#34389) * Support BatchNorm in Hubert pos_conv_emb as in fairseq * Correct the new defaults (#34377) * Correct the new defaults * CIs * add check * Update utils.py * Update utils.py * Add the max_length in generate test checking shape without passing length * style * CIs * fix fx CI issue * [auto. 
ping] Avoid sending empty info + add more team members (#34383) * update * update --------- Co-authored-by: ydshieh * Fix glm (#34388) * Fix duplicated * fix import * Use non nested images and batched text Idefics2/3 (#34222) * add support for non nested images and add tests * add tests error scenario * fix style * added single and no image to error tests * Fix onnx non-expotable inplace aten op (#34376) * fix onnx non-expotable inplace op * mistral, qwen2, qwen2_vl, starcoder2 * fixup copies * Fix right padding in LLaVA models (#34305) * fix right pad llavas * device mismatch * no filter (#34391) * no filter * no filter * no filter --------- Co-authored-by: ydshieh * SynthID: better example (#34372) * better example * Update src/transformers/generation/configuration_utils.py * Update src/transformers/generation/logits_process.py * nits * Tests: upgrade `test_eager_matches_sdpa_generate` (#34386) * Fix bnb training test failure (#34414) * Fix bnb training test: compatibility with OPTSdpaAttention * Avoid check expected exception when it is on CUDA (#34408) * update * update --------- Co-authored-by: ydshieh * Fix typos in agents_advanced.md (#34405) * [docs] Cache implementations (#34325) cache * [run-slow] hubert * Support BatchNorm in Hubert pos_conv_emb as in fairseq Add conversion integration test, and make batchnorm explicit variable * Support BatchNorm in Hubert pos_conv_emb as in fairseq fix make fixup styling changes * [run-slow] hubert * Support BatchNorm in Hubert pos_conv_emb as in fairseq * [run-slow] hubert * Support BatchNorm in Hubert pos_conv_emb as in fairseq Add conversion integration test, and make batchnorm explicit variable * Support BatchNorm in Hubert pos_conv_emb as in fairseq fix make fixup styling changes * [run-slow] hubert * [run-slow] hubert --------- Co-authored-by: Cyril Vallez Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: ydshieh Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com> Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Co-authored-by: Raushan Turganbay Co-authored-by: Joao Gante Co-authored-by: Matthew Douglas <38992547+matthewdouglas@users.noreply.github.com> Co-authored-by: Rudy Delouya Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> --- .../models/hubert/configuration_hubert.py | 4 ++ ..._original_pytorch_checkpoint_to_pytorch.py | 15 ++++++- .../models/hubert/modeling_hubert.py | 40 ++++++++++--------- tests/models/hubert/test_modeling_hubert.py | 37 +++++++++++++++++ 4 files changed, 77 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/hubert/configuration_hubert.py b/src/transformers/models/hubert/configuration_hubert.py index 20977cff87d167..9f488b19888957 100644 --- a/src/transformers/models/hubert/configuration_hubert.py +++ b/src/transformers/models/hubert/configuration_hubert.py @@ -94,6 +94,8 @@ class HubertConfig(PretrainedConfig): embeddings layer. num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16): Number of groups of 1D convolutional positional embeddings layer. + conv_pos_batch_norm (`bool`, *optional*, defaults to `False`): + Whether to use batch norm instead of weight norm in conv_pos do_stable_layer_norm (`bool`, *optional*, defaults to `False`): Whether do apply *stable* layer norm architecture of the Transformer encoder. 
`do_stable_layer_norm is True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is @@ -182,6 +184,7 @@ def __init__( conv_bias=False, num_conv_pos_embeddings=128, num_conv_pos_embedding_groups=16, + conv_pos_batch_norm=False, do_stable_layer_norm=False, apply_spec_augment=True, mask_time_prob=0.05, @@ -209,6 +212,7 @@ def __init__( self.conv_bias = conv_bias self.num_conv_pos_embeddings = num_conv_pos_embeddings self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups + self.conv_pos_batch_norm = conv_pos_batch_norm self.num_feat_extract_layers = len(self.conv_dim) self.num_hidden_layers = num_hidden_layers self.intermediate_size = intermediate_size diff --git a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py index 6478fdadf13de3..4966340493f35c 100644 --- a/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py +++ b/src/transformers/models/hubert/convert_hubert_original_pytorch_checkpoint_to_pytorch.py @@ -38,7 +38,8 @@ MAPPING = { "post_extract_proj": "feature_projection.projection", - "encoder.pos_conv.0": "encoder.pos_conv_embed.conv", + "encoder.pos_conv.0": "encoder.pos_conv_embed.batch_norm", + "encoder.pos_conv.1": "encoder.pos_conv_embed.conv", "self_attn.k_proj": "encoder.layers.*.attention.k_proj", "self_attn.v_proj": "encoder.layers.*.attention.v_proj", "self_attn.q_proj": "encoder.layers.*.attention.q_proj", @@ -76,6 +77,12 @@ def set_recursively(hf_pointer, key, value, full_name, weight_type): hf_pointer.weight_v.data = value elif weight_type == "bias": hf_pointer.bias.data = value + elif weight_type == "running_mean": + hf_pointer.running_mean.data = value + elif weight_type == "running_var": + hf_pointer.running_var.data = value + elif weight_type == "num_batches_tracked": + hf_pointer.num_batches_tracked.data = value else: hf_pointer.data = value @@ -116,6 +123,12 @@ def recursively_load_weights(fairseq_model, hf_model, is_finetuned): weight_type = "weight" elif "bias" in name: weight_type = "bias" + elif "running_mean" in name: + weight_type = "running_mean" + elif "running_var" in name: + weight_type = "running_var" + elif "num_batches_tracked" in name: + weight_type = "num_batches_tracked" else: weight_type = None set_recursively(hf_model, mapped_key, value, name, weight_type) diff --git a/src/transformers/models/hubert/modeling_hubert.py b/src/transformers/models/hubert/modeling_hubert.py index 57f59cf9aab94f..03904a6abfa08b 100755 --- a/src/transformers/models/hubert/modeling_hubert.py +++ b/src/transformers/models/hubert/modeling_hubert.py @@ -260,7 +260,6 @@ def forward(self, hidden_states): return hidden_states -# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->Hubert class HubertPositionalConvEmbedding(nn.Module): def __init__(self, config): super().__init__() @@ -272,32 +271,37 @@ def __init__(self, config): groups=config.num_conv_pos_embedding_groups, ) - weight_norm = nn.utils.weight_norm - if hasattr(nn.utils.parametrizations, "weight_norm"): - weight_norm = nn.utils.parametrizations.weight_norm + self.batch_norm = None + if config.conv_pos_batch_norm: + self.batch_norm = nn.BatchNorm1d(config.hidden_size) + else: + weight_norm = nn.utils.weight_norm + if hasattr(nn.utils.parametrizations, "weight_norm"): + weight_norm = nn.utils.parametrizations.weight_norm - if 
is_deepspeed_zero3_enabled(): - import deepspeed + if is_deepspeed_zero3_enabled(): + import deepspeed - with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): - self.conv = weight_norm(self.conv, name="weight", dim=2) - if hasattr(self.conv, "parametrizations"): - weight_g = self.conv.parametrizations.weight.original0 - weight_v = self.conv.parametrizations.weight.original1 + with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0): + self.conv = weight_norm(self.conv, name="weight", dim=2) + if hasattr(self.conv, "parametrizations"): + weight_g = self.conv.parametrizations.weight.original0 + weight_v = self.conv.parametrizations.weight.original1 + else: + weight_g = self.conv.weight_g + weight_v = self.conv.weight_v + deepspeed.zero.register_external_parameter(self, weight_v) + deepspeed.zero.register_external_parameter(self, weight_g) else: - weight_g = self.conv.weight_g - weight_v = self.conv.weight_v - deepspeed.zero.register_external_parameter(self, weight_v) - deepspeed.zero.register_external_parameter(self, weight_g) - else: - self.conv = weight_norm(self.conv, name="weight", dim=2) + self.conv = weight_norm(self.conv, name="weight", dim=2) self.padding = HubertSamePadLayer(config.num_conv_pos_embeddings) self.activation = ACT2FN[config.feat_extract_activation] def forward(self, hidden_states): hidden_states = hidden_states.transpose(1, 2) - + if self.batch_norm is not None: + hidden_states = self.batch_norm(hidden_states) hidden_states = self.conv(hidden_states) hidden_states = self.padding(hidden_states) hidden_states = self.activation(hidden_states) diff --git a/tests/models/hubert/test_modeling_hubert.py b/tests/models/hubert/test_modeling_hubert.py index 86f2b4119324ae..191d2f8c88c380 100644 --- a/tests/models/hubert/test_modeling_hubert.py +++ b/tests/models/hubert/test_modeling_hubert.py @@ -943,3 +943,40 @@ def test_inference_distilhubert(self): self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3)) self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3)) self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) + + def test_inference_hubert_25hz(self): + model = HubertModel.from_pretrained("slprl/mhubert-base-25hz").to(torch_device) + + sample = self._load_datasamples(1) + input_speech = torch.tensor(sample[0], dtype=torch.float, device=torch_device).unsqueeze(0) + + with torch.no_grad(): + outputs = model(input_speech, output_hidden_states=True).hidden_states[11] + + # expected outputs taken from the original textlesslib implementation by: + # model = SpeechEncoder.by_name(dense_model_name='mhubert-base-25hz', quantizer_model_name='kmeans', + # vocab_size=500, deduplicate=False, need_f0=False) + # model(wav)['dense'] + expected_outputs_first = torch.tensor( + [ + [0.0267, 0.1776, -0.1706, -0.4559], + [-0.2430, -0.2943, -0.1864, -0.1187], + [-0.1812, -0.4239, -0.1916, -0.0858], + [-0.1495, -0.4758, -0.4036, 0.0302], + ], + device=torch_device, + ) + expected_outputs_last = torch.tensor( + [ + [0.3366, -0.2734, -0.1415, -0.3055], + [0.2329, -0.3580, -0.1421, -0.3197], + [0.1631, -0.4301, -0.1965, -0.2956], + [0.3342, -0.2185, -0.2253, -0.2363], + ], + device=torch_device, + ) + expected_output_sum = 1681.7603 + + self.assertTrue(torch.allclose(outputs[:, :4, :4], expected_outputs_first, atol=5e-3)) + self.assertTrue(torch.allclose(outputs[:, -4:, -4:], expected_outputs_last, atol=5e-3)) + self.assertTrue(abs(outputs.sum() - expected_output_sum) < 0.1) From 
5fba3f99c0a44f7613aeaa0550f7786919780663 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Tue, 10 Dec 2024 14:52:20 +0100 Subject: [PATCH 21/62] Remove unnecessary masked_fill in deberta models (#35182) --- src/transformers/models/deberta/modeling_deberta.py | 1 - src/transformers/models/deberta_v2/modeling_deberta_v2.py | 1 - 2 files changed, 2 deletions(-) diff --git a/src/transformers/models/deberta/modeling_deberta.py b/src/transformers/models/deberta/modeling_deberta.py index 6993121b6c1ebe..c9a85bcad1bd6f 100644 --- a/src/transformers/models/deberta/modeling_deberta.py +++ b/src/transformers/models/deberta/modeling_deberta.py @@ -290,7 +290,6 @@ def forward( attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) # bsz x height x length x dimension attention_probs = nn.functional.softmax(attention_scores, dim=-1) - attention_probs.masked_fill(attention_mask, 0) attention_probs = self.dropout(attention_probs) if self.head_weights_proj is not None: diff --git a/src/transformers/models/deberta_v2/modeling_deberta_v2.py b/src/transformers/models/deberta_v2/modeling_deberta_v2.py index 6645c1de832e12..7d2f25603a6f96 100644 --- a/src/transformers/models/deberta_v2/modeling_deberta_v2.py +++ b/src/transformers/models/deberta_v2/modeling_deberta_v2.py @@ -267,7 +267,6 @@ def forward( attention_scores = attention_scores.masked_fill(~(attention_mask), torch.finfo(query_layer.dtype).min) # bsz x height x length x dimension attention_probs = nn.functional.softmax(attention_scores, dim=-1) - attention_probs.masked_fill(attention_mask, 0) attention_probs = self.dropout(attention_probs) context_layer = torch.bmm( From 3e2769a3c998599a9b84b80d9ef8c81b29476276 Mon Sep 17 00:00:00 2001 From: "Huang, Guangtai" Date: Tue, 10 Dec 2024 06:31:22 -0800 Subject: [PATCH 22/62] Fix DBRX LayerNorm init method (#35177) fix dbrx layernorm init --- src/transformers/models/dbrx/modeling_dbrx.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/dbrx/modeling_dbrx.py b/src/transformers/models/dbrx/modeling_dbrx.py index 659fa154ecf776..7d20b766658f23 100644 --- a/src/transformers/models/dbrx/modeling_dbrx.py +++ b/src/transformers/models/dbrx/modeling_dbrx.py @@ -845,7 +845,7 @@ def _init_weights(self, module: nn.Module): if module.padding_idx is not None: module.weight.data[module.padding_idx].zero_() elif isinstance(module, nn.LayerNorm): - module.weight.data.normal_(mean=0.0, std=std) + module.weight.data.fill_(1.0) if module.bias is not None: module.bias.data.zero_() elif isinstance(module, DbrxExpertGLU): From e5c45a667978ecdcfc9d58ae1dae4ad67401ab39 Mon Sep 17 00:00:00 2001 From: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:30:09 +0100 Subject: [PATCH 23/62] Fixing GGUF support for StableLm (#35060) fix Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com> --- src/transformers/modeling_gguf_pytorch_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index 7562649be753bb..00c080fbea81c7 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -307,7 +307,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): ffn_norm_name = "ffn_norm" qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name) 
use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors) - parsed_parameters["config"]["qkv_bias"] = qkv_bias + parsed_parameters["config"]["use_qkv_bias"] = qkv_bias parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual model_size = "" From 425af6cdc20d20e93fb814e6932c1b194b3f2915 Mon Sep 17 00:00:00 2001 From: Ahmed Almaghz <53489256+AhmedAlmaghz@users.noreply.github.com> Date: Tue, 10 Dec 2024 20:08:27 +0300 Subject: [PATCH 24/62] [i18n-ar] Translated file : `docs/source/ar/community.md` into Arabic (#33027) * Add docs/source/ar/community.md to Add_docs_source_ar_community.md * Update community.md * Update community.md * Update community.md * Update _toctree.yml - add community.md * Update docs/source/ar/community.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Create how_to_hack_models.md * Create modular_transformers.md * Create tiktoken.md * Update _toctree.yml * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/how_to_hack_models.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/modular_transformers.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/tiktoken.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> * Update docs/source/ar/tiktoken.md Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --------- Co-authored-by: Abdullah Mohammed <554032+abodacs@users.noreply.github.com> --- docs/source/ar/_toctree.yml | 10 +- docs/source/ar/community.md | 66 +++++++++ docs/source/ar/how_to_hack_models.md | 163 ++++++++++++++++++++++ 
docs/source/ar/modular_transformers.md | 184 +++++++++++++++++++++++++ docs/source/ar/tiktoken.md | 41 ++++++ 5 files changed, 462 insertions(+), 2 deletions(-) create mode 100644 docs/source/ar/community.md create mode 100644 docs/source/ar/how_to_hack_models.md create mode 100644 docs/source/ar/modular_transformers.md create mode 100644 docs/source/ar/tiktoken.md diff --git a/docs/source/ar/_toctree.yml b/docs/source/ar/_toctree.yml index 1208153c22df68..138d3a1bd8aa08 100644 --- a/docs/source/ar/_toctree.yml +++ b/docs/source/ar/_toctree.yml @@ -133,12 +133,18 @@ title: المعايير - local: notebooks title: دفاتر الملاحظات مع الأمثلة -# - local: community -# title: موارد المجتمع + - local: community + title: موارد المجتمع - local: troubleshooting title: استكشاف الأخطاء وإصلاحها - local: gguf title: التوافق مع ملفات GGUF + - local: tiktoken + title: التوافق مع ملفات TikToken + - local: modular_transformers + title: الوحدات النمطية في `transformers` + - local: how_to_hack_models + title: اختراق النموذج (الكتابة فوق فئة لاستخدامك) title: أدلة المطورين # - sections: # - local: quantization/overview diff --git a/docs/source/ar/community.md b/docs/source/ar/community.md new file mode 100644 index 00000000000000..5a1c31de0aaa3f --- /dev/null +++ b/docs/source/ar/community.md @@ -0,0 +1,66 @@ +# مجتمع المطورين + +هذه الصفحة تجمع الموارد حول 🤗 Transformers التي طورها المجتمع. + +## موارد المجتمع: + +| المصدر | الوصف | المؤلف | +|:----------|:-------------|------:| +| [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | مجموعة من البطاقات التعليمية القائمة على [Transformers Docs Glossary](glossary) والتي تم وضعها في شكل يمكن تعلمه/مراجعته بسهولة باستخدام [Anki](https://apps.ankiweb.net/) وهو تطبيق مفتوح المصدر متعدد المنصات مصمم خصيصًا للاحتفاظ بالمعرفة على المدى الطويل. شاهد هذا [فيديو تمهيدي حول كيفية استخدام البطاقات التعليمية](https://www.youtube.com/watch?v=Dji_7PILrw). | [Darigov Research](https://www.darigovresearch.com/) | + +## دفاتر ملاحظات المجتمع: + +| الدفتر | الوصف | المؤلف | | +|:----------|:-------------|:-------------|------:| +| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | كيفية توليد كلمات الأغاني على غرار فنانك المفضل من خلال ضبط نموذج GPT-2 | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) | +| [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | كيفية تدريب T5 لأي مهمة باستخدام Tensorflow 2. 
يوضح هذا الدفتر مهمة السؤال والجواب المنفذة في Tensorflow 2 باستخدام SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) | +| [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | كيفية تدريب T5 على SQUAD مع Transformers و Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) | +| [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | كيفية ضبط نموذج T5 للتصنيف والمهام متعددة الخيارات باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | +| [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | كيفية ضبط نموذج DialoGPT على مجموعة بيانات جديدة لروبوتات الدردشة المحادثية المفتوحة | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | +| [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | كيفية التدريب على تسلسلات طويلة تصل إلى 500,000 رمز باستخدام Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | +| [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | كيفية ضبط نموذج BART للتلخيص باستخدام fastai باستخدام blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | +| [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | كيفية توليد تغريدات على غرار حساب Twitter المفضل لديك من خلال ضبط نموذج GPT-2 | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | +| [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | دليل كامل لعرض تكامل W&B مع Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | +| [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | كيفية بناء نسخة "طويلة" من النماذج المسبقة التدريب الموجودة | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | +| [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | كيفية ضبط نموذج Longformer لمهمة QA | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | +| [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | كيفية تقييم نموذج Longformer على TriviaQA مع `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) | +| [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | كيفية ضبط نموذج T5 لاستخراج المشاعر باستخدام تنسيق النص إلى نص مع PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | +| [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | كيفية ضبط نموذج DistilBert للتصنيف متعدد الفئات باستخدام PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)| +|[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|كيفية ضبط نموذج BERT للتصنيف متعدد التصنيفات باستخدام PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)| +|[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|كيفية ضبط نموذج T5 للتلخيص في PyTorch وتتبع التجارب باستخدام WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)| +|[Speed up Fine-Tuning in Transformers with Dynamic Padding / 
Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|كيفية تسريع الضبط الدقيق بعامل 2 باستخدام الضبط الديناميكي/التقسيم|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)| +|[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| كيفية تدريب نموذج Reformer مع طبقات الانتباه ثنائية الاتجاه | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)| +|[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| كيفية زيادة مفردات نموذج SciBERT المسبق التدريب من AllenAI على مجموعة بيانات CORD وإنشاء خط أنابيب لها. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)| +|[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| كيفية ضبط نموذج BlenderBotSmall للتلخيص على مجموعة بيانات مخصصة، باستخدام واجهة برمجة التطبيقات Trainer. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)| +|[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | كيفية ضبط نموذج Electra للتحليل العاطفي وتفسير التنبؤات باستخدام Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)| +|[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | كيفية ضبط نموذج GPT-2 غير الإنجليزي باستخدام فئة Trainer | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)| +|[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | كيفية ضبط نموذج DistilBERT لمهمة التصنيف متعدد التصنيفات | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)| +|[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | كيفية ضبط نموذج ALBERT أو أي نموذج آخر قائم على BERT لمهمة 
التصنيف المزدوج للجمل | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)| +|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | كيفية ضبط نموذج Roberta للتحليل العاطفي | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)| +|[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | ما مدى دقة الإجابات على الأسئلة التي يولدها نموذجك التحويلي seq2seq؟ | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)| +|[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | كيفية ضبط نموذج DistilBERT للتصنيف النصي في TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)| +|[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* مع نقطة تفتيش *google-bert/bert-base-uncased* للتلخيص على CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)| +|[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | كيفية البدء السريع لنموذج *EncoderDecoderModel* المشترك مع نقطة تفتيش *FacebookAI/roberta-base* للتلخيص على BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)| +|[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | كيفية ضبط نموذج *TapasForQuestionAnswering* مع نقطة تفتيش *tapas-base* على مجموعة بيانات Sequential Question Answering (SQA) | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)| +|[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | كيفية تقييم نموذج *TapasForSequenceClassification* المضبوط مسبقًا مع نقطة تفتيش *tapas-base-finetuned-tabfact* باستخدام مزيج من مكتبتي 🤗 datasets و 🤗 transformers | [Niels Rogge](https://github.com/nielsrogge) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)| +|[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | كيفية ضبط نموذج mBART باستخدام Seq2SeqTrainer للترجمة من الهندية إلى الإنجليزية | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)| +|[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | كيفية ضبط نموذج *LayoutLMForTokenClassification* على مجموعة بيانات FUNSD لاستخراج المعلومات من المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)| +|[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | كيفية ضبط نموذج DistilGPT2 وتوليد النص | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)| +|[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | كيفية ضبط نموذج LED على pubmed للتلخيص طويل المدى | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)| +|[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | كيفية تقييم نموذج LED للتلخيص طويل المدى بشكل فعال | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)| +|[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | كيفية ضبط نموذج *LayoutLMForSequenceClassification* على مجموعة بيانات RVL-CDIP لتصنيف المستندات الممسوحة ضوئيًا | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)| +|[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | كيفية فك تشفير تسلسل CTC مع تعديل نموذج اللغة | [Eric Lam](https://github.com/voidful) | [![Open In 
Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_zQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)| +|[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | كيفية ضبط نموذج BART للتلخيص بلغتين باستخدام فئة Trainer | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)| +|[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | كيفية تقييم نموذج BigBird للأسئلة والأجوبة على وثائق طويلة على Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)| +| [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | كيفية إنشاء تعليقات توضيحية على YouTube من أي فيديو من خلال تفريغ الصوت باستخدام Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | +| [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | كيفية ضبط نموذج Vision Transformer (ViT) على CIFAR-10 باستخدام مكتبات HuggingFace Transformers و Datasets و 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | +| [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | كيفية تقييم نموذج *LukeForEntityClassification* على مجموعة بيانات Open Entity | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | +| [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | كيفية تقييم نموذج 
*LukeForEntityPairClassification* على مجموعة بيانات TACRED | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | +| [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | كيفية تقييم نموذج *LukeForEntitySpanClassification* على مجموعة بيانات CoNLL-2003 | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | +| [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | كيفية تقييم نموذج *BigBirdPegasusForConditionalGeneration* على مجموعة بيانات PubMed | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | +| [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | كيفية استخدام نموذج Wav2Vec2 المسبق التدريب لتصنيف المشاعر على مجموعة بيانات MEGA | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | +| [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | كيفية استخدام نموذج *DetrForObjectDetection* المدرب للكشف عن الأجسام في صورة وتصوير الانتباه | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | +| [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | كيفية ضبط نموذج *DetrForObjectDetection* على مجموعة بيانات الكشف عن الأجسام المخصصة | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | +| [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | كيفية ضبط نموذج *T5* على مهمة التعرف على الكيانات المسماة | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) | +| [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | كيفية استخدام [QLoRA](https://github.com/artidoro/qlora) و 
[PEFT](https://huggingface.co/docs/peft/en/index) لضبط نموذج LLM بطريقة فعالة من حيث الذاكرة، مع استخدام [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) لإدارة تتبع التجارب | [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | diff --git a/docs/source/ar/how_to_hack_models.md b/docs/source/ar/how_to_hack_models.md new file mode 100644 index 00000000000000..8ce3589732f06a --- /dev/null +++ b/docs/source/ar/how_to_hack_models.md @@ -0,0 +1,163 @@ +# كيفية تعديل أي نموذج من نماذج Transformers + +توفر مكتبة [🤗 Transformers](https://github.com/huggingface/transformers) مجموعة من النماذج المسبقة التدريب والأدوات لمعالجة اللغات الطبيعية، والرؤية، وما إلى ذلك. على الرغم من أن هذه النماذج تغطي مجموعة واسعة من التطبيقات، فقد تواجه حالات استخدام لا تدعمها المكتبة بشكل افتراضي. يُمكن للتخصيص أن يفتح إمكانيات جديدة، مثل إضافة طبقات جديدة، أو تعديل البنية المعمارية، أو تحسين آليات الانتباه. سيُوضح لك هذا الدليل كيفية تعديل نماذج Transformers الموجودة لتلبية احتياجاتك المحددة. الشيء الرائع هو أنك لست بحاجة إلى الخروج من إطار عمل Transformers لإجراء هذه التغييرات. ي يمكنك تعديل النماذج مباشرةً في Transformers والاستفادة من الميزات مثل [واجهة برمجة التطبيقات Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer)، و [PreTrainedModel](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel)، والضبط الدقيق الفعال باستخدام أدوات مثل [PEFT](https://huggingface.co/docs/peft/index). + +سنرشدك في هذا الدليل لكيفية تخصيص نماذج Transformers الموجودة لتلبية متطلباتك، دون فقدان مزايا الإطار. ستتعلم كيفية: + +- تعديل بنية نموذج ما من خلال تغيير آلية الانتباه الخاصة به. +- تطبيق تقنيات مثل Low-Rank Adaptation (LoRA) على مكونات نموذج محددة. + +نحن نشجعك على المساهمة باختراقاتك الخاصة ومشاركتها هنا مع المجتمع1 + +## مثال: تعديل آلية الانتباه في نموذج Segment Anything (SAM) + +نموذج **Segment Anything (SAM)** هو نموذج رائد في مجال تجزئة الصور. في تنفيذه الافتراضي، يستخدم SAM إسقاطًا مجمعًا للاستعلام والمفتاح والقيمة (`qkv`) في آلية الانتباه الخاصة به. ومع ذلك، قد ترغب في ضبط مكونات محددة فقط من آلية الانتباه، مثل إسقاطات الاستعلام (`q`) والقيمة (`v`)، لتقليل عدد المعلمات القابلة للتدريب والموارد الحسابية المطلوبة. + +### الدافع + +من خلال تقسيم الإسقاط المجمع `qkv` إلى إسقاطات منفصلة `q` و `k` و `v`، يمكنك تطبيق تقنيات مثل **LoRA** (Low-Rank Adaptation) على إسقاطي `q` و `v` فقط. يسمح لك هذا بما يلي: + +- ضبط عدد أقل من المعلمات، مما يقلل من العبء الحسابي. +- تحقيق أداء أفضل من خلال التركيز على مكونات محددة. +- تجربة استراتيجيات تعديل مختلفة في آلية الانتباه. + +### التنفيذ + +#### **الخطوة 1: إنشاء فئة اهتمام مخصصة** + +بعد ذلك، قم بإنشاء فئة فرعية من فئة `SamVisionAttention` الأصلية وعدلها لتضم إسقاطات `q` و `k` و `v` منفصلة. 
+ +```python +import torch +import torch.nn as nn +from transformers.models.sam.modeling_sam import SamVisionAttention + +class SamVisionAttentionSplit(SamVisionAttention, nn.Module): + def __init__(self, config, window_size): + super().__init__(config, window_size) + del self.qkv + # إسقاطات منفصلة q و k و v + self.q = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.k = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self.v = nn.Linear(config.hidden_size, config.hidden_size, bias=config.qkv_bias) + self._register_load_state_dict_pre_hook(self.split_q_k_v_load_hook) + + def split_q_k_v_load_hook(self, state_dict, prefix, *args): + keys_to_delete = [] + for key in list(state_dict.keys()): + if "qkv." in key: + # تقسيم q و k و v من الإسقاط المجمع + q, k, v = state_dict[key].chunk(3, dim=0) + # استبدال الإسقاطات الفردية q و k و v + state_dict[key.replace("qkv.", "q.")] = q + state_dict[key.replace("qkv.", "k.")] = k + state_dict[key.replace("qkv.", "v.")] = v + # وضع علامة على مفتاح qkv القديم للحذف + keys_to_delete.append(key) + + # حذف مفاتيح qkv القديمة + for key in keys_to_delete: + del state_dict[key] + + def forward(self, hidden_states: torch.Tensor, output_attentions=False) -> torch.Tensor: + batch_size, height, width, _ = hidden_states.shape + qkv_shapes = (batch_size * self.num_attention_heads, height * width, -1) + query = self.q(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + key = self.k(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + value = self.v(hidden_states).reshape((batch_size, height * width,self.num_attention_heads, -1)).permute(0,2,1,3).reshape(qkv_shapes) + + attn_weights = (query * self.scale) @ key.transpose(-2, -1) + + if self.use_rel_pos: + attn_weights = self.add_decomposed_rel_pos( + attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width) + ) + + attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype) + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1) + attn_output = attn_output.permute(0, 2, 3, 1, 4).reshape(batch_size, height, width, -1) + attn_output = self.proj(attn_output) + + if output_attentions: + outputs = (attn_output, attn_weights) + else: + outputs = (attn_output, None) + return outputs +``` + +**الشرح:** + +- **الإسقاطات المنفصلة:** يتم إزالة الإسقاط المُجمع `qkv`، وإنشاء إسقاطات خطية منفصلة `q` و `k` و `v`. +- **دالة استدعاء تحميل الأوزان:** تقوم طريقة `_split_qkv_load_hook` بتقسيم أوزان `qkv` المسبقة التدريب إلى أوزان `q` و `k` و `v` منفصلة عند تحميل النموذج. يضمن هذا التوافق مع أي نموذج مسبق التدريب. +- **التنفيذ الأمامي:** يتم حساب الاستعلامات والمفاتيح والقيم بشكل منفصل، وتستمر آلية الانتباه كالمعتاد. + +#### **الخطوة 2: استبدال فئة الانتباه الأصلية** + +استبدل فئة `SamVisionAttention` الأصلية بفئتك المخصصة بحيث يستخدم النموذج آلية الانتباه المعدلة. 
+ +```python +from transformers import SamModel +from transformers.models.sam import modeling_sam + +# استبدال فئة الاهتمام في وحدة نمطية modeling_sam +modeling_sam.SamVisionAttention = SamVisionAttentionSplit + +# تحميل نموذج SAM المسبق التدريب +model = SamModel.from_pretrained("facebook/sam-vit-base") +``` + +**الشرح:** + +- **استبدال الفئة:** من خلال تعيين فئتك المخصصة إلى `modeling_sam.SamVisionAttention`، فإن أي حالات من فئة `SamVisionAttention` في النموذج ستستخدم النسخة المعدلة. وبالتالي، عند استدعاء `SamModel`، سيتم استخدام `SamVisionAttentionSplit` المحددة حديثًا. +- **تحميل النموذج:** يتم تحميل النموذج باستخدام `from_pretrained`، ويتم دمج آلية الانتباه المخصصة. + +#### **الخطوة 3: تطبيق LoRA على إسقاطات محددة** + +مع وجود إسقاطات `q` و `k` و `v` منفصلة، يمكنك الآن تطبيق LoRA على مكونات محددة، مثل إسقاطات `q` و `v`. + +```python +from peft import LoraConfig, get_peft_model + +config = LoraConfig( + r=16, + lora_alpha=32, + target_modules=["q", "v"], # تطبيق LoRA على إسقاطات q و v + lora_dropout=0.1, + task_type="mask-generation" +) + +# تطبيق LoRA على النموذج +model = get_peft_model(model, config) +``` + +**الشرح:** + +- **تكوين LoRA:** تحدد `LoraConfig` المرتبة `r`، وعامل القياس `lora_alpha`، والوحدات المستهدفة (`"q"` و `"v"`)، ومعدل التخلي، ونوع المهمة. +- **تطبيق LoRA:** تقوم دالة `get_peft_model` بتطبيق LoRA على الوحدات المحددة في النموذج. +- **تقليل المعلمات:** من خلال التركيز على `q` و `v`، فإنك تقلل عدد المعلمات القابلة للتدريب، مما يؤدي إلى تسريع التدريب وتقليل استخدام الذاكرة. + +#### **الخطوة 4: التحقق من عدد المعلمات القابلة للتدريب** + +من السهل التحقق من عدد المعلمات القابلة للتدريب ومعرفة تأثير تعديلك. + +```python +model.print_trainable_parameters() +``` + +**الناتج المتوقع:** + +``` +عدد المعلمات القابلة للتدريب: 608,256 || جميع المعلمات: 94,343,728 || نسبة المعلمات القابلة للتدريب: 0.6447 +عدد المعلمات القابلة للتدريب: 912,384 || جميع المعلمات: 94,647,856 || نسبة المعلمات القابلة للتدريب: 0.9640 # مع k +``` + +## المساهمة بابداعاتك الخاصة + +يمكن لتعديل النماذج المسبقة التدريب أن يفتح آفاقًا جديدة للبحث والتطبيق. من خلال فهم وتعديل الآليات الداخلية للنماذج مثل SAM، يمكنك تخصيصها لتلبية احتياجاتك المحددة، وتحسين الأداء، وتجربة أفكار جديدة. + +إذا قمت بتطوير تعديﻻتك الخاصة لنماذج Transformers وترغب في مشاركتها، ففكر في المساهمة في هذه الوثيقة. + +- **إنشاء طلب سحب (Pull Request):** شارك تغييراتك وتحسيناتك في التعليمات البرمجية مباشرة في المستودع. +- **كتابة التوثيق:** قدم تفسيرات وأمثلة واضحة لتعديلاتك. +- **التفاعل مع المجتمع:** ناقش أفكارك واحصل على تعليقات من المطورين والباحثين الآخرين من خلال فتح مشكلة. diff --git a/docs/source/ar/modular_transformers.md b/docs/source/ar/modular_transformers.md new file mode 100644 index 00000000000000..b500fec1c92d25 --- /dev/null +++ b/docs/source/ar/modular_transformers.md @@ -0,0 +1,184 @@ +# المحولات النمطية + +مكتبة `transformers` هي إطار عمل ذو فلسفة محدد؛ يتم تعريف فلسفتنا في [الدليل المفاهيمي](./philosophy). + +جوهر هذه الفلسفة يتمثل في مبدأ [نموذج واحد، ملف واحد](https://huggingface.co/blog/transformers-design-philosophy) +في المكتبة. الجانب السلبي لهذا المكون هو تقييده لوراثة واستيراد مكونات الملفات. + +نتيجة لذلك، تتكرر مكونات النموذج عبر العديد من الملفات. يحتوي `transformers` على عدد كبير من طبقات الانتباه، يقارب عدد النماذج، والكثير منها متطابق. يتسبب هذا في تباعد عمليات التنفيذ المستقلة مع تطبيق الإصلاحات والتغييرات. +على أجزاء محددة من التعليمات البرمجية. + +ولمعالجة ذلك، اعتمدنا مفهوم "النسخ" في المكتبة. 
فبإضافة تعليق يُشير إلى أن التعليمات البرمجية هي نسخة من أخرى، نضمن من خلال أنظمة CI والأوامر المحلية عدم تباعد النسخ. لكن هذه العملية، رغم بساطتها، تُسبب إرهاقاً. كما أنها تزيد العبء على المساهمين، وهو ما نهدف إلى تجاوزه. + +غالباً ما تتطلب مساهمات النماذج إضافة تعليمات برمجية (حوالي 1000 سطر)، ومعالج (حوالي 500 سطر)، واختبارات، ووثائق، إلخ. ونادراً ما تقل مساهمات النماذج عن 3000-5000 سطر من التعليمات البرمجية، معظمها أكواد نمطية. هذا يرفع مستوى المساهمات، + +ونهدف مع المحولات النمطية إلى خفض هذا المستوى إلى حدّ مقبول. + +## ما هو؟ + +تقدم المحولات النمطية مفهوم ملف "نمطي" لمجلد نموذج. يقبل هذا الملف النمطي تعليمات برمجية +غير مقبولة عادة في ملفات النمذجة/المعالجة، حيث يسمح بالاستيراد من نماذج مجاورة وكذلك +الوراثة من الفئات إلى فئات أخرى. + +يعرّف هذا الملف النمطي النماذج والمعالجات وفئة التكوين التي سيتم تعريفها في وحداتهم +المتعلقة. + +وأخيرًا، يقدم هذا الميزة أداة `linter` جديدة والتي ستعمل على "تفكيك" الملف النمطي إلى بنية "نموذج واحد، ملف واحد" +هيكل الدليل. سيتم إنشاء هذه الملفات تلقائيًا في كل مرة يتم فيها تشغيل البرنامج النصي؛ مما يقلل من المساهمات المطلوبة +إلى الملف النمطي، وبالتالي فقط إلى التغييرات بين النموذج المساهم والنماذج الأخرى. + +سيقوم مستخدمو النموذج في النهاية باستيراد واستخدام واجهة الملف الواحد، لذا لا يتوقع حدوث أي تغيير هنا. من خلال القيام بذلك، +نأمل في الجمع بين أفضل ما في العالمين: تمكين المساهمات البسيطة مع الالتزام بفلسفتنا. + +لذلك، هذا بديل لعلامات `# Copied from`، ويمكن توقع انتقال النماذج المساهمة سابقًا إلى +تنسيق المحولات النمطية الجديد في الأشهر المقبلة. + +### التفاصيل + +تُبسط أداة "linter" الوراثة، مُنشئةً جميع الملفات المفردة من الملف النمطي، مع الحفاظ على شفافيتها أمام مستخدمي Python. حاليًا، تُبسط الأداة مستوىً واحدًا من الوراثة + +على سبيل المثال: +- إذا ورثت فئة التكوين من فئة أخرى وأضافت/حذفت معامل، فسيتم إما الإشارة إلى الملف المولد مباشرةً + (في حالة الإضافة) أو إزالته تمامًا (في حالة الحذف). +- إذا ورثت فئة من فئة أخرى، على سبيل المثال: `class GemmaModel(LlamaModel):`، تُستنتج التبعيات تلقائيًا + سيتم استنتاج جميع الوحدات الفرعية تلقائيًا من الفئة الأصلية. +- إذا قمت بتعريف وظائف جديدة في الملف `modular` واستخدمتها داخل الفئات، فستستنتج أداة linter ذلك تلقائيًا + +يجب أن تكون قادرًا على كتابة كل شيء (المجزىء اللغوي، ومُعالِج الصور، والنموذج، والتكوين) في الملف `modular`، وسيتم إنشاء الملفات المُقابلة تلقائيًا. + +### التطبيق + +[TODO] نقدم اختبارًا جديدًا، للتأكد من أن المحتوى المولد يتطابق مع ما هو موجود في `modular_xxxx.py` + +### الأمثلة + +هنا مثال سريع باستخدام BERT و RoBERTa. النموذجان مرتبطان ارتباطًا وثيقًا: يختلف تنفيذهما النموذجي في طبقة تضمين. + +بدلاً من إعادة تعريف النموذج بالكامل، إليك كيف يبدو ملف `modular_roberta.py` لفئات النمذجة والتكوين (لأغراض المثال، يتم تجاهل المجزىء اللغوي في هذا الوقت حيث أنه مختلف جدًا). + +```python +from torch import nn +from ..bert.configuration_bert import BertConfig +from ..bert.modeling_bert import ( + BertModel, + BertEmbeddings, + BertForMaskedLM +) + +# تكوين RoBERTa مطابق لتكوين BERT +class RobertaConfig(BertConfig): + model_type = 'roberta' + +# نعيد تعريف الإضافات هنا لتسليط الضوء على اختلاف معرف الحشو، ونعيد تعريف الإضافات الموضعية +class RobertaEmbeddings(BertEmbeddings): + def __init__(self, config): + super().__init__(config()) + + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + +# نموذج RoBERTa مطابق لنموذج BERT، باستثناء طبقة الإضافات. 
+# نعيد تعريف الإضافات أعلاه، لذا هنا لا توجد حاجة لعمل إضافي +class RobertaModel(BertModel): + def __init__(self, config): + super().__init__(config) + self.embeddings = RobertaEmbeddings(config) + + +# الرؤوس الآن تحتاج فقط إلى إعادة تعريف النموذج داخل `RobertaModel` الصحيح +class RobertaForMaskedLM(BertForMaskedLM): + def __init__(self, config): + super().__init__(config) + self.model = RobertaModel(config) +``` + +لاحظ أنه إذا لم تستخدم الاعتماد الذي حددته، فستحصل على الخطأ التالي: + +```bash +ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should be used + when you define `BertModel`, as it is one of it's direct dependencies. Make sure + you use it in the `__init__` function. +``` + +بالإضافة إلى ذلك، قد تجد قائمة بالأمثلة هنا: + +## ما هو ليس كذلك + +ليس بديلاً لتعليمات برمجة النمذجة (بعد؟)، وإذا لم يكن نموذجك يعتمد على أي شيء آخر موجود من قبل، فيمكنك إضافة ملف `نمذجة` كالعادة. + + +## الاستخدام المتقدم + +### إزالة السمات والوظائف +لإزالة السمات التي لا تستخدم في نموذجك النمطي، والتي لا تريد رؤيتها في النمذجة المفككة: + +```python +class GemmaModel(LlamaModel): | class GemmaModel(PreTrainedModel): + def __init__(self, config): | def __init__(self, config): + super().__init__(self, eos_token) | super().__init__(config) + del self.embed_tokens | self.padding_idx = config.pad_token_id + | self.vocab_size = config.vocab_size + | + | self.layers = nn.ModuleList( + | [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + | ) + | self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + | self.rotary_emb = LlamaRotaryEmbedding(config=config) + | self.gradient_checkpointing = False + | + | # Initialize weights and apply final processing + | self.post_init() +``` +إذا قمت بالتحقق من `LlamaModel` الأصلي، فستجد `embed_tokens` الذي تمت إزالته هنا (كما هو متوقع!) + +إزالة وظيفة مشابهة، تحتاج فقط إلى كتابتها مع `raise ValueError("")` لمحاكاة السلوك الذي تريده فعليًا عند إزالة وظيفة أصلية في بايثون. + +```python +class GemmaTokenizer(LlamaTokenizer): + ... + + def get_spm_processor(self): + raise AttributeError("Not needed for Gemma") + + def unk_token_length(self): + raise AttributeError("Not needed for Gemma") +``` + +### تعريف وظائف جديدة + +إذا قمت بتعريف وظيفة جديدة في الملف `modular` لاستخدامها داخل فئة، على سبيل المثال + +```python +def my_new_function(*args, **kwargs): + # Do something here + pass + +class GemmaModel(LlamaModel): + def forward(*args, **kwargs): + # Call the function + example = my_new_function(*args, **kwargs) + # continue here +``` + +سيتم نسخ وظيفة `my_new_function` (وبشكل متكرر، أي وظائف أخرى جديدة يتم استدعاؤها في جسمها) تلقائيًا +في الملف الذي يتم استخدامه. + +### استدعاء `super()` +قمنا مؤخرًا بشحن بعض الميزات التي تسمح لك بالانتقال من: +```python +class GemmaTokenizer(LlamaTokenizer, PretrainedTokenizerFast): | class GemmaModel(nn.Module): + def __init__(self, eos_token=""): | def __init__(self): + eos_token = AddedToken(eos_token) | eos_token = AddedToken(eos_token) + PretrainedTokenizerFast.__init__(self, eos_token) | super().__init__(eos_token) +``` +هذا مفيد عندما لا تريد تفكيك استدعاء `super()`، وتريد التمييز بين أي استدعاء super init تقوم به! + +### التسمية الخاصة +ندعم الآن أيضًا حالات خاصة مثل +```python +class GemmaVisionModel(CLIPModel): + pass +``` +حيث اسم فئة `GemmaVision` الخاصة بك ليس هو نفسه `Gemma` النمطي. هذا مفيد للغاية للنماذج المركبة. 
diff --git a/docs/source/ar/tiktoken.md b/docs/source/ar/tiktoken.md new file mode 100644 index 00000000000000..6f3755d8670cdc --- /dev/null +++ b/docs/source/ar/tiktoken.md @@ -0,0 +1,41 @@ +# Tiktoken والتفاعل مع Transformers + +يتم دمج دعم ملفات نموذج tiktoken بسلاسة في 🤗 transformers عند تحميل النماذج +`from_pretrained` مع ملف `tokenizer.model` tiktoken على Hub، والذي يتم تحويله تلقائيًا إلى [المحلل اللغوي السريع](https://huggingface.co/docs/transformers/main/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast). + +### النماذج المعروفة التي تم إصدارها مع `tiktoken.model`: + - gpt2 + - llama3 + +## مثال على الاستخدام + +من أجل تحميل ملفات `tiktoken` في `transformers`، تأكد من أن ملف `tokenizer.model` هو ملف tiktoken وسيتم تحميله تلقائيًا عند التحميل `from_pretrained`. إليك كيفية تحميل مجزىء لغوي ونموذج، والذي +يمكن تحميله من نفس الملف بالضبط: + +```py +from transformers import AutoTokenizer + +model_id = "meta-llama/Meta-Llama-3-8B-Instruct" +tokenizer = AutoTokenizer.from_pretrained(model_id, subfolder="original") +``` +## إنشاء مجزىء لغوي tiktoken + +لا يحتوي ملف `tokenizer.model` على أي معلومات حول الرموز أو الأنماط الإضافية. إذا كانت هذه الأمور مهمة، قم بتحويل المحلل اللغوي إلى `tokenizer.json`، وهو التنسيق المناسب لـ [`PreTrainedTokenizerFast`]. + +قم بتوليد ملف `tokenizer.model` باستخدام [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) ثم قم بتحويله إلى `tokenizer.json` باستخدام [`convert_tiktoken_to_fast`]. + +```py + +from transformers.integrations.tiktoken import convert_tiktoken_to_fast +from tiktoken import get_encoding + +# يمكنك تحميل ترميزك المخصص أو الترميز الذي توفره OpenAI +encoding = get_encoding("gpt2") +convert_tiktoken_to_fast(encoding, "config/save/dir") +``` + +يتم حفظ ملف `tokenizer.json` الناتج في الدليل المحدد ويمكن تحميله باستخدام [`PreTrainedTokenizerFast`]. 
+ +```py +tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir") +``` From 52d135426f387862db8c73ea5ab8f69338ee81c7 Mon Sep 17 00:00:00 2001 From: Henry Hyeonmok Ko <52618631+henryhmko@users.noreply.github.com> Date: Tue, 10 Dec 2024 09:08:55 -0800 Subject: [PATCH 25/62] Multiple typo fixes in NLP, Audio docs (#35181) Fixed multiple typos in Tutorials, NLP, and Audio sections --- docs/source/en/tasks/asr.md | 2 +- docs/source/en/tasks/multiple_choice.md | 4 ++-- docs/source/en/tasks/question_answering.md | 4 ++-- docs/source/en/tasks/summarization.md | 2 +- docs/source/en/tasks/translation.md | 2 +- src/transformers/integrations/peft.py | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/en/tasks/asr.md b/docs/source/en/tasks/asr.md index f3e068444ca556..87b8f024420ce6 100644 --- a/docs/source/en/tasks/asr.md +++ b/docs/source/en/tasks/asr.md @@ -112,7 +112,7 @@ The next step is to load a Wav2Vec2 processor to process the audio signal: >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base") ``` -The MInDS-14 dataset has a sampling rate of 8000kHz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000kHz to use the pretrained Wav2Vec2 model: +The MInDS-14 dataset has a sampling rate of 8000Hz (you can find this information in its [dataset card](https://huggingface.co/datasets/PolyAI/minds14)), which means you'll need to resample the dataset to 16000Hz to use the pretrained Wav2Vec2 model: ```py >>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000)) diff --git a/docs/source/en/tasks/multiple_choice.md b/docs/source/en/tasks/multiple_choice.md index 06eb45eda99150..18b12f2166637e 100644 --- a/docs/source/en/tasks/multiple_choice.md +++ b/docs/source/en/tasks/multiple_choice.md @@ -419,7 +419,7 @@ Get the class with the highest probability: ```py >>> predicted_class = logits.argmax().item() >>> predicted_class -'0' +0 ``` @@ -448,7 +448,7 @@ Get the class with the highest probability: ```py >>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) >>> predicted_class -'0' +0 ``` diff --git a/docs/source/en/tasks/question_answering.md b/docs/source/en/tasks/question_answering.md index 998010e67ca95f..41d7fd48cf816e 100644 --- a/docs/source/en/tasks/question_answering.md +++ b/docs/source/en/tasks/question_answering.md @@ -325,7 +325,7 @@ or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/no Evaluation for question answering requires a significant amount of postprocessing. To avoid taking up too much of your time, this guide skips the evaluation step. The [`Trainer`] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. -If have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course! +If you have more time and you're interested in how to evaluate your model for question answering, take a look at the [Question answering](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) chapter from the 🤗 Hugging Face Course! 
## Inference @@ -397,7 +397,7 @@ Tokenize the text and return TensorFlow tensors: >>> from transformers import AutoTokenizer >>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") ->>> inputs = tokenizer(question, text, return_tensors="tf") +>>> inputs = tokenizer(question, context, return_tensors="tf") ``` Pass your inputs to the model and return the `logits`: diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index 7d7ecf1fbab6db..e16dd17dfe1fc8 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -283,7 +283,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: ```py >>> from transformers.keras_callbacks import KerasMetricCallback ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) ``` Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md index 426ba1c340fb81..922cdc7241176a 100644 --- a/docs/source/en/tasks/translation.md +++ b/docs/source/en/tasks/translation.md @@ -290,7 +290,7 @@ Pass your `compute_metrics` function to [`~transformers.KerasMetricCallback`]: ```py >>> from transformers.keras_callbacks import KerasMetricCallback ->>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) +>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set) ``` Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]: diff --git a/src/transformers/integrations/peft.py b/src/transformers/integrations/peft.py index ef09281431169f..69e674a2160643 100644 --- a/src/transformers/integrations/peft.py +++ b/src/transformers/integrations/peft.py @@ -108,7 +108,7 @@ def load_adapter( token (`str`, `optional`): - Whether to use authentication token to load the remote folder. Userful to load private repositories + Whether to use authentication token to load the remote folder. Useful to load private repositories that are on HuggingFace Hub. You might need to call `huggingface-cli login` and paste your tokens to cache it. 
device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*): From 217c47e31bc0cd442443e5b4a62c8bc2785d53ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ga=C3=A9tan=20Lepage?= <33058747+GaetanLepage@users.noreply.github.com> Date: Tue, 10 Dec 2024 18:19:30 +0100 Subject: [PATCH 26/62] Only import torch.distributed if it is available (#35133) --- src/transformers/pytorch_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py index 5bdf8a355ddfaa..fab1b9118d18d3 100644 --- a/src/transformers/pytorch_utils.py +++ b/src/transformers/pytorch_utils.py @@ -38,8 +38,10 @@ is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13") is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12") +# Cache this result has it's a C FFI call which can be pretty time-consuming +_torch_distributed_available = torch.distributed.is_available() -if is_torch_greater_or_equal("2.5"): +if is_torch_greater_or_equal("2.5") and _torch_distributed_available: from torch.distributed.tensor import Replicate from torch.distributed.tensor.parallel import ( ColwiseParallel, From 91b8ab18b778ae9e2f8191866e018cd1dc7097be Mon Sep 17 00:00:00 2001 From: French_Ball <127096560+asdkfjsd@users.noreply.github.com> Date: Wed, 11 Dec 2024 01:58:47 +0800 Subject: [PATCH 27/62] [i18n-] Translating Benchmarks.md to Chinese (#35137) * add "Translating Benchmarks.md to Chinese " * Removed all the English original text (which was previously kept as comments in the document) and refined some of the Chinese expressions. --- docs/source/zh/_toctree.yml | 2 + docs/source/zh/benchmarks.md | 377 +++++++++++++++++++++++++++++++++++ 2 files changed, 379 insertions(+) create mode 100644 docs/source/zh/benchmarks.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index d4863efde710ea..c4c5890ed0b3f4 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -50,6 +50,8 @@ title: 导出为 TFLite - local: torchscript title: 导出为 TorchScript + - local: benchmarks + title: 对模型进行基准测试 - local: gguf title: 与 GGUF 格式的互操作性 - local: tiktoken diff --git a/docs/source/zh/benchmarks.md b/docs/source/zh/benchmarks.md new file mode 100644 index 00000000000000..2e9787c9a3bb6b --- /dev/null +++ b/docs/source/zh/benchmarks.md @@ -0,0 +1,377 @@ + + +# 基准测试 + + + +小提示:Hugging Face的基准测试工具已经不再更新,建议使用外部基准测试库来衡量Transformer模 +型的速度和内存复杂度。 + + + +[[open-in-colab]] + +让我们来看看如何对🤗 Transformers模型进行基准测试,以及进行测试的推荐策略和已有的基准测试结果。 + +如果您需要更详细的回答,可以在[这里](https://github.com/huggingface/notebooks/tree/main/examples/benchmark.ipynb)找到更多关于基准测试的内容。 + + +## 如何对🤗 Transformers模型进行基准测试 + +使用[`PyTorchBenchmark`]和[`TensorFlowBenchmark`]类可以灵活地对🤗 Transformers模型进行基准测试。这些基准测试类可以衡量模型在**推理**和**训练**过程中所需的**峰值内存**和**时间**。 + + + +这里的**推理**指的是一次前向传播(forward pass),而训练则指一次前向传播和反向传播(backward pass)。 + + + + +基准测试类 [`PyTorchBenchmark`] 和 [`TensorFlowBenchmark`] 需要分别传入 [`PyTorchBenchmarkArguments`] 和 [`TensorFlowBenchmarkArguments`] 类型的对象来进行实例化。这些类是数据类型,包含了所有相关的配置参数,用于其对应的基准测试类。 + +在下面的示例中,我们展示了如何对类型为 **bert-base-cased** 的BERT模型进行基准测试: + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments + +>>> args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +>>> benchmark = PyTorchBenchmark(args) +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments + 
+>>> args = TensorFlowBenchmarkArguments( +... models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> benchmark = TensorFlowBenchmark(args) +``` + + + +在这里,基准测试的参数数据类接受了三个主要的参数,即 `models`、`batch_sizes` 和`sequence_lengths`。其中,`models` 是必需的参数,它期望一个来自[模型库](https://huggingface.co/models)的模型标识符列表。`batch_sizes` 和 `sequence_lengths` 是列表类型的参数,定义了进行基准测试时 `input_ids` 的批量大小和序列长度。 + +这些是基准测试数据类中可以配置的一些主要参数。除此之外,基准测试数据类中还可以配置很多其他参数。如需要查看更详细的配置参数,可以直接查看以下文件: + +* `src/transformers/benchmark/benchmark_args_utils.py` +* `src/transformers/benchmark/benchmark_args.py`(针对 PyTorch) +* `src/transformers/benchmark/benchmark_args_tf.py`(针对 TensorFlow) + +另外,您还可以通过在根目录下运行以下命令,查看针对 PyTorch 和 TensorFlow 的所有可配置参数的描述列表: +``` bash python examples/pytorch/benchmarking/run_benchmark.py --help ``` +这些命令将列出所有可以配置的参数,它们可以帮助您更加灵活地进行基准测试。 + + + + + + +以下代码通过`PyTorchBenchmarkArguments`设置模型批处理大小和序列长度,然后调用`benchmark.run()`执行基准测试。 + +```py +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 0.006 +google-bert/bert-base-uncased 8 32 0.006 +google-bert/bert-base-uncased 8 128 0.018 +google-bert/bert-base-uncased 8 512 0.088 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 1227 +google-bert/bert-base-uncased 8 32 1281 +google-bert/bert-base-uncased 8 128 1307 +google-bert/bert-base-uncased 8 512 1539 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 08:58:43.371351 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help +``` + +接下来,只需要调用 `benchmark.run()` 就能轻松运行已经实例化的基准测试对象。 + +```py +>>> results = benchmark.run() +>>> print(results) +>>> results = benchmark.run() +>>> print(results) +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 0.005 +google-bert/bert-base-uncased 8 32 0.008 +google-bert/bert-base-uncased 8 128 0.022 +google-bert/bert-base-uncased 8 512 0.105 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== 
+-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +google-bert/bert-base-uncased 8 8 1330 +google-bert/bert-base-uncased 8 32 1330 +google-bert/bert-base-uncased 8 128 1330 +google-bert/bert-base-uncased 8 512 1770 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:26:35.617317 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + + + +在一般情况下,基准测试会测量推理(inference)的**时间**和**所需内存**。在上面的示例输出中,前两部分显示了与**推理时间**和**推理内存**对应的结果。与此同时,关于计算环境的所有相关信息(例如 GPU 类型、系统、库版本等)会在第三部分的**环境信息**中打印出来。你可以通过在 [`PyTorchBenchmarkArguments`] 和 [`TensorFlowBenchmarkArguments`] 中添加 `save_to_csv=True`参数,将这些信息保存到一个 .csv 文件中。在这种情况下,每一部分的信息会分别保存在不同的 .csv 文件中。每个 .csv 文件的路径也可以通过参数数据类进行定义。 + + +您可以选择不通过预训练模型的模型标识符(如 `google-bert/bert-base-uncased`)进行基准测试,而是对任何可用模型类的任意配置进行基准测试。在这种情况下,我们必须将一系列配置与基准测试参数一起传入,方法如下: + + + +```py +>>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig + +>>> args = PyTorchBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1277 +bert-base 8 32 1281 +bert-base 8 128 1307 +bert-base 8 512 1539 +bert-384-hid 8 8 1005 +bert-384-hid 8 32 1027 +bert-384-hid 8 128 1035 +bert-384-hid 8 512 1255 +bert-6-lay 8 8 1097 +bert-6-lay 8 32 1101 +bert-6-lay 8 128 1127 +bert-6-lay 8 512 1359 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: PyTorch +- use_torchscript: False +- framework_version: 1.4.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:35:25.143267 +- 
fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + +```py +>>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +>>> args = TensorFlowBenchmarkArguments( +... models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] +... ) +>>> config_base = BertConfig() +>>> config_384_hid = BertConfig(hidden_size=384) +>>> config_6_lay = BertConfig(num_hidden_layers=6) + +>>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +>>> benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Time in s +-------------------------------------------------------------------------------- +bert-base 8 8 0.005 +bert-base 8 32 0.008 +bert-base 8 128 0.022 +bert-base 8 512 0.106 +bert-384-hid 8 8 0.005 +bert-384-hid 8 32 0.007 +bert-384-hid 8 128 0.018 +bert-384-hid 8 512 0.064 +bert-6-lay 8 8 0.002 +bert-6-lay 8 32 0.003 +bert-6-lay 8 128 0.0011 +bert-6-lay 8 512 0.074 +-------------------------------------------------------------------------------- + +==================== INFERENCE - MEMORY - RESULT ==================== +-------------------------------------------------------------------------------- +Model Name Batch Size Seq Length Memory in MB +-------------------------------------------------------------------------------- +bert-base 8 8 1330 +bert-base 8 32 1330 +bert-base 8 128 1330 +bert-base 8 512 1770 +bert-384-hid 8 8 1330 +bert-384-hid 8 32 1330 +bert-384-hid 8 128 1330 +bert-384-hid 8 512 1540 +bert-6-lay 8 8 1330 +bert-6-lay 8 32 1330 +bert-6-lay 8 128 1330 +bert-6-lay 8 512 1540 +-------------------------------------------------------------------------------- + +==================== ENVIRONMENT INFORMATION ==================== + +- transformers_version: 2.11.0 +- framework: Tensorflow +- use_xla: False +- framework_version: 2.2.0 +- python_version: 3.6.10 +- system: Linux +- cpu: x86_64 +- architecture: 64bit +- date: 2020-06-29 +- time: 09:38:15.487125 +- fp16: False +- use_multiprocessing: True +- only_pretrain_model: False +- cpu_ram_mb: 32088 +- use_gpu: True +- num_gpus: 1 +- gpu: TITAN RTX +- gpu_ram_mb: 24217 +- gpu_power_watts: 280.0 +- gpu_performance_state: 2 +- use_tpu: False +``` + + + + + **推理时间**和**推理所需内存**会被重新测量,不过这次是针对 `BertModel` 类的自定义配置进行基准测试。这个功能在决定模型应该使用哪种配置进行训练时尤其有用。 + + +## 基准测试的推荐策略 +本节列出了一些在对模型进行基准测试时比较推荐的策略: + +* 目前,该模块只支持单设备基准测试。在进行 GPU 基准测试时,建议用户通过设置 `CUDA_VISIBLE_DEVICES` 环境变量来指定代码应在哪个设备上运行,例如在运行代码前执行 `export CUDA_VISIBLE_DEVICES=0`。 +* `no_multi_processing` 选项仅应在测试和调试时设置为 `True`。为了确保内存测量的准确性,建议将每个内存基准测试单独运行在一个进程中,并确保 `no_multi_processing` 设置为 `True`。 +* 当您分享模型基准测试结果时,应始终提供环境信息。由于 GPU 设备、库版本等之间可能存在较大差异,单独的基准测试结果对社区的帮助有限。 + + +## 分享您的基准测试结果 + +先前的所有可用的核心模型(当时有10个)都已针对 **推理时间** 进行基准测试,涵盖了多种不同的设置:使用 PyTorch(包不包含 TorchScript),使用 TensorFlow(包不包含 XLA)。所有的测试都在 CPU(除了 TensorFlow XLA)和 GPU 上进行。 + +这种方法的详细信息可以在 [这篇博客](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2) 中找到,测试结果可以在 [这里](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing) 查看。 + + +您可以借助新的 **基准测试** 工具比以往任何时候都更容易地分享您的基准测试结果! 
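
As a rough illustration of the strategy and sharing recommendations above, here is a minimal sketch that pins the run to a single GPU and writes both the measurements and the environment information to CSV files for sharing. Only `save_to_csv` is taken directly from this section; the CSV file-name arguments and the chosen file names are assumptions for illustration, so check the `PyTorchBenchmarkArguments` reference before relying on them.

```py
import os

# The section above recommends `export CUDA_VISIBLE_DEVICES=0` in the shell; setting the
# variable here before torch is imported likewise pins the benchmark to a single GPU.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["google-bert/bert-base-uncased"],
    batch_sizes=[8],
    sequence_lengths=[8, 32, 128, 512],
    save_to_csv=True,  # write results to CSV files instead of only printing them
    # Assumed file-name arguments; drop them to keep the defaults.
    inference_time_csv_file="bert_inference_time.csv",
    inference_memory_csv_file="bert_inference_memory.csv",
    env_info_csv_file="bert_env_info.csv",
)

benchmark = PyTorchBenchmark(args)
results = benchmark.run()
# The CSV files can now be shared alongside the printed results and environment information.
```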
+ +- [PyTorch 基准测试结果](https://github.com/huggingface/transformers/tree/main/examples/pytorch/benchmarking/README.md) +- [TensorFlow 基准测试结果](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/benchmarking/README.md) + + From 5290f6a62dd4892b07ff290087591d8a625a9f1f Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 10 Dec 2024 11:36:25 -0800 Subject: [PATCH 28/62] [docs] Fix FlashAttention link (#35171) fix link --- docs/source/en/model_doc/idefics2.md | 2 +- docs/source/en/model_doc/llava_next_video.md | 2 +- docs/source/en/model_doc/mistral.md | 2 +- docs/source/en/model_doc/mixtral.md | 2 +- docs/source/en/model_doc/video_llava.md | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md index 5ad56b7b5c525d..b9b51082f29e5b 100644 --- a/docs/source/en/model_doc/idefics2.md +++ b/docs/source/en/model_doc/idefics2.md @@ -141,7 +141,7 @@ Do note that when training Idefics2 on multi-turn conversations between a user a ## Model optimizations: Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/llava_next_video.md b/docs/source/en/model_doc/llava_next_video.md index f8a149f12b6779..cc3a61aae6c736 100644 --- a/docs/source/en/model_doc/llava_next_video.md +++ b/docs/source/en/model_doc/llava_next_video.md @@ -240,7 +240,7 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained("llava-hf/LLaVA-N ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md index 2be657109a8d46..cfa2af3678137a 100644 --- a/docs/source/en/model_doc/mistral.md +++ b/docs/source/en/model_doc/mistral.md @@ -91,7 +91,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mistral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. 
However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md index 7afcaa798ecac4..b5451702e44a16 100644 --- a/docs/source/en/model_doc/mixtral.md +++ b/docs/source/en/model_doc/mixtral.md @@ -93,7 +93,7 @@ As can be seen, the instruction-tuned model requires a [chat template](../chat_t ## Speeding up Mixtral by using Flash Attention -The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +The code snippets above showcase inference without any optimization tricks. However, one can drastically speed up the model by leveraging [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md index 105307196effd0..a3ba1258ecfa06 100644 --- a/docs/source/en/model_doc/video_llava.md +++ b/docs/source/en/model_doc/video_llava.md @@ -174,7 +174,7 @@ model = VideoLlavaForConditionalGeneration.from_pretrained("LanguageBind/Video-L ### Flash-Attention 2 to speed-up generation -Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one.md#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. +Additionally, we can greatly speed-up model inference by using [Flash Attention](../perf_train_gpu_one#flash-attention-2), which is a faster implementation of the attention mechanism used inside the model. First, make sure to install the latest version of Flash Attention 2: From e8508924fdfe1ad96e35d23e795eb7f7beec836a Mon Sep 17 00:00:00 2001 From: John Graham Reynolds Date: Tue, 10 Dec 2024 18:16:01 -0500 Subject: [PATCH 29/62] Update data collator docstrings to accurately reference Nvidia tensor core compute capability version (#35188) update data collator docs to reflect correct tensor core compute capability Co-authored-by: John Graham Reynolds --- src/transformers/data/data_collator.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/transformers/data/data_collator.py b/src/transformers/data/data_collator.py index 9e75e6fd3c38df..e84c9d0ef3ce2e 100644 --- a/src/transformers/data/data_collator.py +++ b/src/transformers/data/data_collator.py @@ -256,7 +256,7 @@ class DataCollatorWithPadding: If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). + 7.0 (Volta). return_tensors (`str`, *optional*, defaults to `"pt"`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". """ @@ -308,7 +308,7 @@ class DataCollatorForTokenClassification(DataCollatorMixin): If set will pad the sequence to a multiple of the provided value. 
This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). + 7.0 (Volta). label_pad_token_id (`int`, *optional*, defaults to -100): The id to use when padding the labels (-100 will be automatically ignore by PyTorch loss functions). return_tensors (`str`, *optional*, defaults to `"pt"`): @@ -568,7 +568,7 @@ class DataCollatorForSeq2Seq: If set will pad the sequence to a multiple of the provided value. This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= - 7.5 (Volta). + 7.0 (Volta). label_pad_token_id (`int`, *optional*, defaults to -100): The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions). return_tensors (`str`, *optional*, defaults to `"pt"`): @@ -693,6 +693,9 @@ class DataCollatorForLanguageModeling(DataCollatorMixin): The probability with which to (randomly) mask tokens in the input, when `mlm` is set to `True`. pad_to_multiple_of (`int`, *optional*): If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >= + 7.0 (Volta). return_tensors (`str`): The type of Tensor to return. Allowable values are "np", "pt" and "tf". From 10feacd88aef9569e240b7e3833ab32b297e4460 Mon Sep 17 00:00:00 2001 From: HMJ0628 <2383422508@qq.com> Date: Wed, 11 Dec 2024 07:16:37 +0800 Subject: [PATCH 30/62] [i18n-] Translating agents.md to Chinese (#35139) * add "translate agents.md" * add "agents.md" * add "translate warnings" * add "totree" * add "remove transformer_agent" * add "remove transformer _agent file" --------- Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> --- docs/source/zh/_toctree.yml | 6 +- docs/source/zh/agents.md | 427 ++++++++++++++++++++++++++ docs/source/zh/transformers_agents.md | 285 ----------------- 3 files changed, 430 insertions(+), 288 deletions(-) create mode 100644 docs/source/zh/agents.md delete mode 100644 docs/source/zh/transformers_agents.md diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml index c4c5890ed0b3f4..bd0cc7c7f7f97d 100644 --- a/docs/source/zh/_toctree.yml +++ b/docs/source/zh/_toctree.yml @@ -23,8 +23,8 @@ title: 使用🤗 PEFT加载和训练adapters - local: model_sharing title: 分享您的模型 - - local: transformers_agents - title: agents教程 + - local: agents + title: 智能体和工具 - local: llm_tutorial title: 使用LLMs进行生成 title: 教程 @@ -102,7 +102,7 @@ - sections: - sections: - local: main_classes/agent - title: Agents和工具 + title: 智能体和工具 - local: main_classes/callback title: Callbacks - local: main_classes/configuration diff --git a/docs/source/zh/agents.md b/docs/source/zh/agents.md new file mode 100644 index 00000000000000..00fa74e6545025 --- /dev/null +++ b/docs/source/zh/agents.md @@ -0,0 +1,427 @@ + +# 智能体和工具 + +[[在colab里打开]] + +### 什么是智能体 (Agent)? + +大型语言模型(LLM)经过 [因果语言建模训练](./tasks/language_modeling) 可以应对各种任务,但在一些基本任务(如逻辑推理、计算和搜索)上常常表现不佳。当它们被用在自己不擅长的领域时,往往无法生成我们期望的答案。 + +为了解决这个问题,可以创建**智能体**. 
+ +智能体是一个系统,它使用 LLM 作为引擎,并且能够访问称为**工具**的功能。 + +这些**工具**是执行任务的函数,包含所有必要的描述信息,帮助智能体正确使用它们。 + +智能体可以被编程为: +- 一次性设计一系列工具并同时执行它们,像 [`CodeAgent`] +- 一次执行一个工具,并等待每个工具的结果后再启动下一个,像 [`ReactJsonAgent`] + +### 智能体类型 + +#### 代码智能体 + +此智能体包含一个规划步骤,然后生成 Python 代码一次性执行所有任务。它原生支持处理不同输入和输出类型,因此推荐用于多模态任务。 + +#### 推理智能体 + +这是解决推理任务的首选代理,因为 ReAct 框架 ([Yao et al., 2022](https://huggingface.co/papers/2210.03629)) 使其在基于之前观察进行推理时非常高效。 + +我们实现了两种版本的 ReactJsonAgent: +- [`ReactJsonAgent`] 将工具调用作为 JSON 格式输出。 +- [`ReactCodeAgent`] 是 ReactJsonAgent 的一种新型,生成工具调用的代码块,对于具备强大编程能力的 LLM 非常适用。 + +> [TIP] +> 阅读 [Open-source LLMs as LangChain Agents](https://huggingface.co/blog/open-source-llms-as-agents) 博文,了解更多关于推理智能体的信息。 + +
+ + +
+ +![推理智能体的框架](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/open-source-llms-as-agents/ReAct.png) + +以下是一个推理代码智能体如何处理以下问题的示例: + +```py3 +>>> agent.run( +... "How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?", +... ) +=====New task===== +How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need? +====Agent is executing the code below: +bert_blocks = search(query="number of blocks in BERT base encoder") +print("BERT blocks:", bert_blocks) +==== +Print outputs: +BERT blocks: twelve encoder blocks + +====Agent is executing the code below: +attention_layer = search(query="number of layers in Attention is All You Need") +print("Attention layers:", attention_layer) +==== +Print outputs: +Attention layers: Encoder: The encoder is composed of a stack of N = 6 identical layers. Each layer has two sub-layers. The first is a multi-head self-attention mechanism, and the second is a simple, position- 2 Page 3 Figure 1: The Transformer - model architecture. + +====Agent is executing the code below: +bert_blocks = 12 +attention_layers = 6 +diff = bert_blocks - attention_layers +print("Difference in blocks:", diff) +final_answer(diff) +==== + +Print outputs: +Difference in blocks: 6 + +Final answer: 6 +``` + +### 如何构建智能体? + +要初始化一个智能体,您需要以下参数: + +- **一个 LLM** 来驱动智能体——智能体本身并不是 LLM,而是一个使用 LLM 作为引擎的程序。 +- **一个系统提示**:告诉 LLM 引擎应该如何生成输出。 +- **一个工具箱**,智能体可以从中选择工具执行。 +- **一个解析器**,从 LLM 输出中提取出哪些工具需要调用,以及使用哪些参数。 + +在智能体系统初始化时,工具属性将生成工具描述,并嵌入到智能体的系统提示中,告知智能体可以使用哪些工具,并且为什么使用它们。 + +**安装依赖** + +首先,您需要安装**智能体**所需的额外依赖: + +```bash +pip install transformers[agents] +``` +**创建LLM引擎** + +定义一个 `llm_engine` 方法,该方法接受一系列[消息](./chat_templating)并返回文本。该 `callable` 还需要接受一个 `stop` 参数,用于指示何时停止生成输出。 + +```python +from huggingface_hub import login, InferenceClient + +login("") + +client = InferenceClient(model="meta-llama/Meta-Llama-3-70B-Instruct") + +def llm_engine(messages, stop_sequences=["Task"]) -> str: + response = client.chat_completion(messages, stop=stop_sequences, max_tokens=1000) + answer = response.choices[0].message.content + return answer +``` + +您可以使用任何符合以下要求的 `llm_engine` 方法: +1. [输入格式](./chat_templating)为 (`List[Dict[str, str]]`),并且返回一个字符串。 +2. 它在 `stop_sequences` 参数传递的序列处停止生成输出。 + +此外,`llm_engine` 还可以接受一个 `grammar` 参数。如果在智能体初始化时指定了 `grammar`,则该参数将传递给 `llm_engine` 的调用,以允许[受限生成](https://huggingface.co/docs/text-generation-inference/conceptual/guidance),以强制生成格式正确的智能体输出。 + +您还需要一个 `tools` 参数,它接受一个 `Tools` 列表 —— 可以是空列表。您也可以通过定义可选参数 `add_base_tools=True` 来将默认工具箱添加到工具列表中。 + +现在,您可以创建一个智能体,例如 [`CodeAgent`],并运行它。您还可以创建一个 [`TransformersEngine`],使用 `transformers` 在本地机器上运行预初始化的推理管道。 为了方便起见,由于智能体行为通常需要更强大的模型,例如 `Llama-3.1-70B-Instruct`,它们目前较难在本地运行,我们还提供了 [`HfApiEngine`] 类,它在底层初始化了一个 `huggingface_hub.InferenceClient`。 + +```python +from transformers import CodeAgent, HfApiEngine + +llm_engine = HfApiEngine(model="meta-llama/Meta-Llama-3-70B-Instruct") +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) + +agent.run( + "Could you translate this sentence from French, say it out loud and return the audio.", + sentence="Où est la boulangerie la plus proche?", +) +``` + +当你急需某个东西时这将会很有用! 
+您甚至可以将 `llm_engine` 参数留空,默认情况下会创建一个 [`HfApiEngine`]。 + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], add_base_tools=True) + +agent.run( + "Could you translate this sentence from French, say it out loud and give me the audio.", + sentence="Où est la boulangerie la plus proche?", +) +``` + +请注意,我们使用了额外的 `sentence` 参数:您可以将文本作为附加参数传递给模型。 + +您还可以使用这个来指定本地或远程文件的路径供模型使用: + +```py +from transformers import ReactCodeAgent + +agent = ReactCodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) + +agent.run("Why does Mike not know many people in New York?", audio="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/recording.mp3") +``` + +系统提示和输出解析器会自动定义,但您可以通过调用智能体的 `system_prompt_template` 来轻松查看它们。 + +```python +print(agent.system_prompt_template) +``` + +尽可能清楚地解释您要执行的任务非常重要。 每次 [`~Agent.run`] 操作都是独立的,并且由于智能体是由 LLM 驱动的,提示中的细微变化可能会导致完全不同的结果。 +您还可以连续运行多个任务,每次都会重新初始化智能体的 `agent.task` 和 `agent.logs` 属性。 + + +#### 代码执行 + +Python 解释器在一组输入和工具上执行代码。 这应该是安全的,因为只能调用您提供的工具(特别是 Hugging Face 的工具)和 print 函数,因此您已经限制了可以执行的操作。 + +Python 解释器默认不允许导入不在安全列表中的模块,因此大多数明显的攻击问题应该不成问题。 您仍然可以通过在 [`ReactCodeAgent`] 或 [`CodeAgent`] 初始化时通过 `additional_authorized_imports` 参数传递一个授权的模块列表来授权额外的导入: + +```py +>>> from transformers import ReactCodeAgent + +>>> agent = ReactCodeAgent(tools=[], additional_authorized_imports=['requests', 'bs4']) +>>> agent.run("Could you get me the title of the page at url 'https://huggingface.co/blog'?") + +(...) +'Hugging Face – Blog' +``` + +如果有任何代码尝试执行非法操作,或者生成的代码出现常规 Python 错误,执行将停止。 + +> [!WARNING] +> 在使用大语言模型(LLM)生成代码时,生成的代码会被执行,避免导入或使用任何不安全的库或模块。 + +### 系统提示 + +智能体,或者说驱动智能体的 LLM,根据系统提示生成输出。系统提示可以定制并根据目标任务进行调整。例如,检查 [`ReactCodeAgent`] 的系统提示(以下版本经过简化)。 + +```text +You will be given a task to solve as best you can. +You have access to the following tools: +<> + +To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences. + +At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task, then the tools that you want to use. +Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '/End code' sequence. +During each intermediate step, you can use 'print()' to save whatever important information you will then need. +These print outputs will then be available in the 'Observation:' field, for using this information as input for the next step. + +In the end you have to return a final answer using the `final_answer` tool. + +Here are a few examples using notional tools: +--- +{examples} + +Above example were using notional tools that might not exist for you. You only have acces to those tools: +<> +You also can perform computations in the python code you generate. + +Always provide a 'Thought:' and a 'Code:\n```py' sequence ending with '```' sequence. You MUST provide at least the 'Code:' sequence to move forward. + +Remember to not perform too many operations in a single code block! You should split the task into intermediate code blocks. +Print results at the end of each step to save the intermediate results. Then use final_answer() to return the final result. + +Remember to make sure that variables you use are all defined. + +Now Begin! 
+``` + +系统提示包括: +- 解释智能体应该如何工作以及工具的**介绍**。 +- 所有工具的描述由 `<>` 标记在运行时动态替换,这样智能体就知道可以使用哪些工具及其用途。 + - 工具的描述来自工具的属性,`name`、`description`、`inputs` 和 `output_type`,以及一个简单的 `jinja2` 模板,您可以根据需要进行调整。 +- 期望的输出格式。 + +您可以通过向 `system_prompt` 参数传递自定义提示来最大程度地提高灵活性,从而覆盖整个系统提示模板。 + +```python +from transformers import ReactJsonAgent +from transformers.agents import PythonInterpreterTool + +agent = ReactJsonAgent(tools=[PythonInterpreterTool()], system_prompt="{your_custom_prompt}") +``` + +> [WARNING] +> 必须在`template`中定义 `<>` 这个变量,以便智能体能够正确地识别并使用可用的工具 + + +### 检查智能体的运行 + +以下是检查运行后发生了什么的一些有用属性: +- `agent.logs` 存储了智能体的详细日志。每一步的所有内容都会存储在一个字典中,然后附加到 `agent.logs`。 +- 运行 `agent.write_inner_memory_from_logs()` 会从日志中创建智能体的内存,以便 LLM 查看,作为一系列聊天消息。此方法会遍历日志的每个步骤,只保存其感兴趣的消息:例如,它会单独保存系统提示和任务,然后为每个步骤保存 LLM 输出的消息,以及工具调用输出的消息。如果您想要更高层次的查看发生了什么,可以使用此方法 —— 但并不是每个日志都会被此方法转录。 + +## 工具 + +工具是智能体使用的基本功能。 + +例如,您可以检查 [`PythonInterpreterTool`]:它有一个名称、描述、输入描述、输出类型和 `__call__` 方法来执行该操作。 + +当智能体初始化时,工具属性会用来生成工具描述,然后将其嵌入到智能体的系统提示中,这让智能体知道可以使用哪些工具以及为什么使用它们。 + +### 默认工具箱 + +Transformers 提供了一个默认工具箱,用于增强智能体,您可以在初始化时通过 `add_base_tools=True` 参数将其添加到智能体中: + +- **文档问答**:给定一个文档(如图像格式的 PDF),回答关于该文档的问题([Donut](./model_doc/donut)) +- **图像问答**:给定一张图片,回答关于该图像的问题([VILT](./model_doc/vilt)) +- **语音转文本**:给定一个人讲述的音频录音,将其转录为文本(Whisper) +- **文本转语音**:将文本转换为语音([SpeechT5](./model_doc/speecht5)) +- **翻译**:将给定的句子从源语言翻译为目标语言 +- **DuckDuckGo 搜索**:使用 `DuckDuckGo` 浏览器进行网络搜索 +- **Python 代码解释器**:在安全环境中运行 LLM 生成的 Python 代码。只有在初始化 [`ReactJsonAgent`] 时将 `add_base_tools=True` 时,代码智能体才会添加此工具,因为基于代码的智能体已经能够原生执行 Python 代码 + + +您可以通过调用 [`load_tool`] 函数来手动使用某个工具并执行任务。 + + +```python +from transformers import load_tool + +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") +``` + + +### 创建新工具 + +您可以为 `Hugging Face` 默认工具无法涵盖的用例创建自己的工具。 +例如,假设我们要创建一个返回在 `Hugging Face Hub` 上某个任务中下载次数最多的模型的工具。 + +您将从以下代码开始: + +```python +from huggingface_hub import list_models + +task = "text-classification" + +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) +``` + +这段代码可以很快转换为工具,只需将其包装成一个函数,并添加 `tool` 装饰器: + + +```py +from transformers import tool + +@tool +def model_download_tool(task: str) -> str: + """ + This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. + It returns the name of the checkpoint. + + Args: + task: The task for which + """ + model = next(iter(list_models(filter="text-classification", sort="downloads", direction=-1))) + return model.id +``` + +该函数需要: +- 一个清晰的名称。名称通常描述工具的功能。由于代码返回某个任务中下载次数最多的模型,因此我们将其命名为 `model_download_tool`。 +- 对输入和输出进行类型提示 +- 描述,其中包括 "`Args`:" 部分,描述每个参数(这次不需要类型指示,它会从类型提示中获取)。 + +所有这些将自动嵌入到智能体的系统提示中,因此请尽量使它们尽可能清晰! + +> [TIP] +> 这个定义格式与 apply_chat_template 中使用的工具模式相同,唯一的区别是添加了 tool 装饰器:可以在我们的工具使用 API 中[了解更多](https://huggingface.co/blog/unified-tool-use#passing-tools-to-a-chat-template). + +然后,您可以直接初始化您的智能体: +```py +from transformers import CodeAgent +agent = CodeAgent(tools=[model_download_tool], llm_engine=llm_engine) +agent.run( + "Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" +) +``` + +您将得到以下输出: +```text +======== New task ======== +Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub? 
+==== Agent is executing the code below: +most_downloaded_model = model_download_tool(task="text-to-video") +print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.") +==== +``` + +输出: +`"The most downloaded model for the 'text-to-video' task is ByteDance/AnimateDiff-Lightning."` + +### 管理智能体的工具箱 + +如果您已经初始化了一个智能体,但想添加一个新的工具,重新初始化智能体会很麻烦。借助 Transformers,您可以通过添加或替换工具来管理智能体的工具箱。 + +让我们将 `model_download_tool` 添加到一个仅初始化了默认工具箱的现有智能体中。 + +```python +from transformers import CodeAgent + +agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True) +agent.toolbox.add_tool(model_download_tool) +``` +现在,我们可以同时使用新工具和之前的文本到语音工具: + +```python +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub and return the audio?" +) +``` + + +| **Audio** | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +|
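
Beyond adding tools, the same toolbox can be updated or pruned. The sketch below reuses the `llm_engine` and `model_download_tool` defined in the snippets above and assumes `Toolbox` also exposes `update_tool` and `remove_tool` methods; those method names are assumptions, so check the `Toolbox` API reference before relying on them.

```py
from transformers import CodeAgent

agent = CodeAgent(tools=[], llm_engine=llm_engine, add_base_tools=True)
agent.toolbox.add_tool(model_download_tool)

# Replace a tool that shares the same name with a newer implementation
# (assumed method name, see the Toolbox API reference):
agent.toolbox.update_tool(model_download_tool)

# Remove a tool from the toolbox by name (assumed method name as well):
agent.toolbox.remove_tool("model_download_tool")
```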