diff --git a/docs/source/ko/_toctree.yml b/docs/source/ko/_toctree.yml index 20519157eddc6d..eaacc998298ff4 100644 --- a/docs/source/ko/_toctree.yml +++ b/docs/source/ko/_toctree.yml @@ -358,8 +358,8 @@ title: (번역중) CodeGen - local: model_doc/cohere title: Cohere - - local: in_translation - title: (번역중) ConvBERT + - local: model_doc/convbert + title: ConvBERT - local: in_translation title: (번역중) CPM - local: in_translation diff --git a/docs/source/ko/model_doc/convbert.md b/docs/source/ko/model_doc/convbert.md new file mode 100644 index 00000000000000..ec64a369b56a3f --- /dev/null +++ b/docs/source/ko/model_doc/convbert.md @@ -0,0 +1,135 @@ + + +# ConvBERT [[convbert]] + +
+ +Models + + +Spaces + +
+ +## 개요 [[overview]] + +ConvBERT 모델은 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan에 의해 제안되었으며, 제안 논문 제목은 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496)입니다. + +논문의 초록은 다음과 같습니다: + +*BERT와 그 변형 모델과 같은 사전 학습된 언어 모델들은 최근 다양한 자연어 이해 과제에서 놀라운 성과를 이루었습니다. 그러나 BERT는 글로벌 셀프 어텐션 블록에 크게 의존하기 때문에 메모리 사용량이 많고 계산 비용이 큽니다. 모든 어텐션 헤드가 글로벌 관점에서 어텐션 맵을 생성하기 위해 입력 시퀀스 전체를 탐색하지만, 일부 헤드는 로컬 종속성만 학습할 필요가 있다는 것을 발견했습니다. 이는 불필요한 계산이 포함되어 있음을 의미합니다. 따라서 우리는 이러한 self-attention 헤드들을 대체하여 로컬 종속성을 직접 모델링하기 위해 새로운 span 기반 동적 컨볼루션을 제안합니다. 새로운 컨볼루션 헤드와 나머지 self-attention 헤드들이 결합하여 글로벌 및 로컬 문맥 학습에 더 효율적인 혼합 어텐션 블록을 구성합니다. 우리는 BERT에 이 혼합 어텐션 설계를 적용하여 ConvBERT 모델을 구축했습니다. 실험 결과, ConvBERT는 다양한 다운스트림 과제에서 BERT 및 그 변형 모델보다 더 우수한 성능을 보였으며, 훈련 비용과 모델 파라미터 수가 더 적었습니다. 특히 ConvBERTbase 모델은 GLUE 스코어 86.4를 달성하여 ELECTRAbase보다 0.7 높은 성과를 보이며, 훈련 비용은 1/4 이하로 줄었습니다. 코드와 사전 학습된 모델은 공개될 예정입니다.* + +이 모델은 [abhishek](https://huggingface.co/abhishek)에 의해 기여되었으며, 원본 구현은 여기에서 찾을 수 있습니다: https://github.com/yitu-opensource/ConvBert + + + +## 사용 팁 [[usage-tips]] +ConvBERT 훈련 팁은 BERT와 유사합니다. 사용 팁은 [BERT 문서](bert)를 참고하십시오. 
+ + +## 리소스 [[resources]] + +- [텍스트 분류 작업 가이드 (Text classification task guide)](../tasks/sequence_classification) +- [토큰 분류 작업 가이드 (Token classification task guide)](../tasks/token_classification) +- [질의응답 작업 가이드 (Question answering task guide)](../tasks/question_answering) +- [마스킹된 언어 모델링 작업 가이드 (Masked language modeling task guide)](../tasks/masked_language_modeling) +- [다중 선택 작업 가이드 (Multiple choice task guide)](../tasks/multiple_choice) + +## ConvBertConfig [[transformers.ConvBertConfig]] + +[[autodoc]] ConvBertConfig + +## ConvBertTokenizer [[transformers.ConvBertTokenizer]] + +[[autodoc]] ConvBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +## ConvBertTokenizerFast [[transformers.ConvBertTokenizerFast]] + +[[autodoc]] ConvBertTokenizerFast + + + + +## ConvBertModel [[transformers.ConvBertModel]] + +[[autodoc]] ConvBertModel + - forward + +## ConvBertForMaskedLM [[transformers.ConvBertForMaskedLM]] + +[[autodoc]] ConvBertForMaskedLM + - forward + +## ConvBertForSequenceClassification [[transformers.ConvBertForSequenceClassification]] + +[[autodoc]] ConvBertForSequenceClassification + - forward + +## ConvBertForMultipleChoice [[transformers.ConvBertForMultipleChoice]] + +[[autodoc]] ConvBertForMultipleChoice + - forward + +## ConvBertForTokenClassification [[transformers.ConvBertForTokenClassification]] + +[[autodoc]] ConvBertForTokenClassification + - forward + +## ConvBertForQuestionAnswering [[transformers.ConvBertForQuestionAnswering]] + +[[autodoc]] ConvBertForQuestionAnswering + - forward + + + + +## TFConvBertModel [[transformers.TFConvBertModel]] + +[[autodoc]] TFConvBertModel + - call + +## TFConvBertForMaskedLM [[transformers.TFConvBertForMaskedLM]] + +[[autodoc]] TFConvBertForMaskedLM + - call + +## TFConvBertForSequenceClassification [[transformers.TFConvBertForSequenceClassification]] + +[[autodoc]] TFConvBertForSequenceClassification + - call + +## 
TFConvBertForMultipleChoice [[transformers.TFConvBertForMultipleChoice]] + +[[autodoc]] TFConvBertForMultipleChoice + - call + +## TFConvBertForTokenClassification [[transformers.TFConvBertForTokenClassification]] + +[[autodoc]] TFConvBertForTokenClassification + - call + +## TFConvBertForQuestionAnswering [[transformers.TFConvBertForQuestionAnswering]] + +[[autodoc]] TFConvBertForQuestionAnswering + - call + + + diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py index 3bed494b75c608..8353333ef827ed 100644 --- a/examples/pytorch/contrastive-image-text/run_clip.py +++ b/examples/pytorch/contrastive-image-text/run_clip.py @@ -141,10 +141,6 @@ class DataTrainingArguments: default=None, metadata={"help": "An optional input evaluation data file (a jsonlines file)."}, ) - test_file: Optional[str] = field( - default=None, - metadata={"help": "An optional input testing data file (a jsonlines file)."}, - ) max_seq_length: Optional[int] = field( default=128, metadata={ @@ -190,9 +186,6 @@ def __post_init__(self): if self.validation_file is not None: extension = self.validation_file.split(".")[-1] assert extension in ["csv", "json"], "`validation_file` should be a csv or a json file." - if self.test_file is not None: - extension = self.test_file.split(".")[-1] - assert extension in ["csv", "json"], "`test_file` should be a csv or a json file." 
dataset_name_mapping = { @@ -315,9 +308,6 @@ def main(): if data_args.validation_file is not None: data_files["validation"] = data_args.validation_file extension = data_args.validation_file.split(".")[-1] - if data_args.test_file is not None: - data_files["test"] = data_args.test_file - extension = data_args.test_file.split(".")[-1] dataset = load_dataset( extension, data_files=data_files, @@ -387,8 +377,6 @@ def _freeze_params(module): column_names = dataset["train"].column_names elif training_args.do_eval: column_names = dataset["validation"].column_names - elif training_args.do_predict: - column_names = dataset["test"].column_names else: logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.") return @@ -490,29 +478,6 @@ def filter_corrupt_images(examples): # Transform images on the fly as doing it on the whole dataset takes too much time. eval_dataset.set_transform(transform_images) - if training_args.do_predict: - if "test" not in dataset: - raise ValueError("--do_predict requires a test dataset") - test_dataset = dataset["test"] - if data_args.max_eval_samples is not None: - max_eval_samples = min(len(test_dataset), data_args.max_eval_samples) - test_dataset = test_dataset.select(range(max_eval_samples)) - - test_dataset = test_dataset.filter( - filter_corrupt_images, batched=True, num_proc=data_args.preprocessing_num_workers - ) - test_dataset = test_dataset.map( - function=tokenize_captions, - batched=True, - num_proc=data_args.preprocessing_num_workers, - remove_columns=[col for col in column_names if col != image_column], - load_from_cache_file=not data_args.overwrite_cache, - desc="Running tokenizer on test dataset", - ) - - # Transform images on the fly as doing it on the whole dataset takes too much time. - test_dataset.set_transform(transform_images) - # 8. 
Initialize our trainer trainer = Trainer( model=model, diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py index 1d892c49a231fc..60f9f34cf861c9 100755 --- a/src/transformers/configuration_utils.py +++ b/src/transformers/configuration_utils.py @@ -190,6 +190,8 @@ class PretrainedConfig(PushToHubMixin): """ model_type: str = "" + base_config_key: str = "" + sub_configs: Dict[str, "PretrainedConfig"] = {} is_composition: bool = False attribute_map: Dict[str, str] = {} _auto_class: Optional[str] = None @@ -543,11 +545,22 @@ def from_pretrained( cls._set_token_in_kwargs(kwargs, token) config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + if cls.base_config_key and cls.base_config_key in config_dict: + config_dict = config_dict[cls.base_config_key] + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) + # sometimes the config has no `base_config_key` if the config is used in several composite models + # e.g. LlamaConfig. In that case we try to see if there is match in `model_type` before raising a warning + for k, v in config_dict.items(): + if isinstance(v, dict) and v.get("model_type") == cls.model_type: + config_dict = v + + # raise warning only if we still can't see a match in `model_type` + if config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) return cls.from_dict(config_dict, **kwargs) diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py index 53cd2df3a49c84..6e6d5b8bdce71d 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -1452,11 +1452,10 @@ def _prepare_generated_length( ): generation_config.max_length -= inputs_tensor.shape[1] elif has_default_max_length: # by default let's always generate 20 new tokens - if generation_config.max_length == GenerationConfig().max_length: - generation_config.max_length = generation_config.max_length + input_ids_length - max_position_embeddings = getattr(self.config, "max_position_embeddings", None) - if max_position_embeddings is not None: - generation_config.max_length = min(generation_config.max_length, max_position_embeddings) + generation_config.max_length = generation_config.max_length + input_ids_length + max_position_embeddings = getattr(self.config, "max_position_embeddings", None) + if max_position_embeddings is not None: + generation_config.max_length = min(generation_config.max_length, max_position_embeddings) # same for min length if generation_config.min_new_tokens is not None: diff --git a/src/transformers/modeling_gguf_pytorch_utils.py b/src/transformers/modeling_gguf_pytorch_utils.py index c784ca0eb4ca2c..f58bf330ce7db3 100644 --- a/src/transformers/modeling_gguf_pytorch_utils.py +++ b/src/transformers/modeling_gguf_pytorch_utils.py @@ -106,6 +106,17 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False): if "qwen2moe" in architecture: updated_architecture = "qwen2_moe" + # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors + # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors + # If `use_parallel_residual=False`, ffn_norm will be present in the tensors + if "stablelm" in architecture: + attn_bias_name = {"attn_q.bias", "attn_k.bias", "attn_v.bias"} + ffn_norm_name = "ffn_norm" + qkv_bias = 
any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name) + use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors) + parsed_parameters["config"]["qkv_bias"] = qkv_bias + parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual + model_size = "" # extract the number of params from file name as architectures can differ ; # eg. for falcon : `...falcon-7b-...` diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 2ef4c3615c9fa2..0df59d1db8e05b 100755 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -136,6 +136,7 @@ _init_weights = True +_is_quantized = False def is_fsdp_enabled(): @@ -213,6 +214,16 @@ def _skip_init(*args, **kwargs): setattr(torch.nn.init, name, init_func) +@contextmanager +def set_quantized_state(): + global _is_quantized + _is_quantized = True + try: + yield + finally: + _is_quantized = False + + def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]): try: return next(parameter.parameters()).device @@ -1531,7 +1542,7 @@ def _from_config(cls, config, **kwargs): torch_dtype=torch_dtype, ) - if is_deepspeed_zero3_enabled(): + if is_deepspeed_zero3_enabled() and not _is_quantized: import deepspeed logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") @@ -1597,15 +1608,14 @@ def _autoset_attn_implementation( # Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict. # Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)` # If any of sub-modules doesn't support requested attn, an error will be raised. 
See https://github.com/huggingface/transformers/pull/32238 - for key in config: - if isinstance(getattr(config, key), PretrainedConfig): - sub_config = getattr(config, key) - curr_attn_implementation = ( - requested_attn_implementation - if not isinstance(requested_attn_implementation, dict) - else requested_attn_implementation.get(key, None) - ) - sub_config._attn_implementation_internal = curr_attn_implementation + for key in config.sub_configs.keys(): + sub_config = getattr(config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + sub_config._attn_implementation_internal = curr_attn_implementation if use_flash_attention_2: logger.warning_once( @@ -4086,6 +4096,9 @@ def from_pretrained( ) init_contexts.append(init_empty_weights()) + if is_deepspeed_zero3_enabled() and is_quantized: + init_contexts.append(set_quantized_state()) + config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. if not getattr(config, "_attn_implementation_autoset", False): config = cls._autoset_attn_implementation( diff --git a/src/transformers/models/align/configuration_align.py b/src/transformers/models/align/configuration_align.py index 99fa81b4a9350d..a22ab1dc40f8d0 100644 --- a/src/transformers/models/align/configuration_align.py +++ b/src/transformers/models/align/configuration_align.py @@ -14,8 +14,7 @@ # limitations under the License. 
"""ALIGN model configuration""" -import os -from typing import TYPE_CHECKING, List, Union +from typing import TYPE_CHECKING, List if TYPE_CHECKING: @@ -95,6 +94,7 @@ class AlignTextConfig(PretrainedConfig): ```""" model_type = "align_text_model" + base_config_key = "text_config" def __init__( self, @@ -133,24 +133,6 @@ def __init__( self.use_cache = use_cache self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class AlignVisionConfig(PretrainedConfig): r""" @@ -223,6 +205,7 @@ class AlignVisionConfig(PretrainedConfig): ```""" model_type = "align_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -272,24 +255,6 @@ def __init__( self.drop_connect_rate = drop_connect_rate self.num_hidden_layers = sum(num_block_repeats) * 4 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AlignConfig - if config_dict.get("model_type") == "align": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AlignConfig(PretrainedConfig): r""" @@ -340,6 +305,7 @@ class AlignConfig(PretrainedConfig): ```""" model_type = "align" + sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig} def __init__( self, diff --git a/src/transformers/models/altclip/configuration_altclip.py b/src/transformers/models/altclip/configuration_altclip.py index 7333fa63a35280..3c8e91bd473533 100755 --- a/src/transformers/models/altclip/configuration_altclip.py +++ b/src/transformers/models/altclip/configuration_altclip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""AltCLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -199,6 +196,7 @@ class AltCLIPVisionConfig(PretrainedConfig): ```""" model_type = "altclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -233,24 +231,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from AltCLIPConfig - if config_dict.get("model_type") == "altclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class AltCLIPConfig(PretrainedConfig): r""" @@ -298,6 +278,7 @@ class AltCLIPConfig(PretrainedConfig): ```""" model_type = "altclip" + sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/bark/configuration_bark.py b/src/transformers/models/bark/configuration_bark.py index 6dd08b65e89e6c..a498d1dd19371d 100644 --- a/src/transformers/models/bark/configuration_bark.py +++ b/src/transformers/models/bark/configuration_bark.py @@ -14,12 +14,11 @@ # limitations under the License. 
"""BARK model configuration""" -import os -from typing import Dict, Optional, Union +from typing import Dict from ...configuration_utils import PretrainedConfig from ...utils import add_start_docstrings, logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -64,7 +63,6 @@ class BarkSubModelConfig(PretrainedConfig): - model_type = "bark_module" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { @@ -101,38 +99,6 @@ def __init__( super().__init__(**kwargs) - @classmethod - def from_pretrained( - cls, - pretrained_model_name_or_path: Union[str, os.PathLike], - cache_dir: Optional[Union[str, os.PathLike]] = None, - force_download: bool = False, - local_files_only: bool = False, - token: Optional[Union[str, bool]] = None, - revision: str = "main", - **kwargs, - ) -> "PretrainedConfig": - kwargs["cache_dir"] = cache_dir - kwargs["force_download"] = force_download - kwargs["local_files_only"] = local_files_only - kwargs["revision"] = revision - - cls._set_token_in_kwargs(kwargs, token) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the config dict if we are loading from Bark - if config_dict.get("model_type") == "bark": - config_dict = config_dict[f"{cls.model_type}_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - @add_start_docstrings( BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"), @@ -154,6 +120,7 @@ def from_pretrained( ) class BarkSemanticConfig(BarkSubModelConfig): model_type = "semantic" + base_config_key = "semantic_config" @add_start_docstrings( @@ -176,6 +143,7 @@ class BarkSemanticConfig(BarkSubModelConfig): ) class BarkCoarseConfig(BarkSubModelConfig): model_type = "coarse_acoustics" + base_config_key = "coarse_acoustics_config" @add_start_docstrings( @@ -203,6 +171,7 @@ class BarkCoarseConfig(BarkSubModelConfig): ) class BarkFineConfig(BarkSubModelConfig): model_type = "fine_acoustics" + base_config_key = "fine_acoustics_config" def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs): self.n_codes_total = n_codes_total @@ -265,6 +234,12 @@ class BarkConfig(PretrainedConfig): """ model_type = "bark" + sub_configs = { + "semantic_config": BarkSemanticConfig, + "coarse_acoustics_config": BarkCoarseConfig, + "fine_acoustics_config": BarkFineConfig, + "codec_config": AutoConfig, + } def __init__( self, diff --git a/src/transformers/models/blip/configuration_blip.py b/src/transformers/models/blip/configuration_blip.py index 4772738be10352..18db71eb14890b 100644 --- a/src/transformers/models/blip/configuration_blip.py +++ b/src/transformers/models/blip/configuration_blip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Blip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -96,6 +93,7 @@ class BlipTextConfig(PretrainedConfig): ```""" model_type = "blip_text_model" + base_config_key = "text_config" def __init__( self, @@ -146,24 +144,6 @@ def __init__( self.use_cache = use_cache self.label_smoothing = label_smoothing - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BlipVisionConfig(PretrainedConfig): r""" @@ -215,6 +195,7 @@ class BlipVisionConfig(PretrainedConfig): ```""" model_type = "blip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -245,24 +226,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from BlipConfig - if config_dict.get("model_type") == "blip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class BlipConfig(PretrainedConfig): r""" @@ -316,6 +279,7 @@ class BlipConfig(PretrainedConfig): ```""" model_type = "blip" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} def __init__( self, diff --git a/src/transformers/models/blip_2/configuration_blip_2.py b/src/transformers/models/blip_2/configuration_blip_2.py index 16fa4aec38492b..d690d22338a687 100644 --- a/src/transformers/models/blip_2/configuration_blip_2.py +++ b/src/transformers/models/blip_2/configuration_blip_2.py @@ -14,13 +14,12 @@ # limitations under the License. 
"""BLIP-2 model configuration""" -import os -from typing import Optional, Union +from typing import Optional from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -76,6 +75,7 @@ class Blip2VisionConfig(PretrainedConfig): ```""" model_type = "blip_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -106,24 +106,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2QFormerConfig(PretrainedConfig): r""" @@ -190,6 +172,7 @@ class Blip2QFormerConfig(PretrainedConfig): ```""" model_type = "blip_2_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +212,6 @@ def __init__( self.encoder_hidden_size = encoder_hidden_size self.use_qformer_text_input = use_qformer_text_input - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from Blip2Config - if config_dict.get("model_type") == "blip-2": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Blip2Config(PretrainedConfig): r""" @@ -306,6 +271,7 @@ class Blip2Config(PretrainedConfig): ```""" model_type = "blip-2" + sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig} def __init__( self, diff --git a/src/transformers/models/bridgetower/configuration_bridgetower.py b/src/transformers/models/bridgetower/configuration_bridgetower.py index 4985b6ef89fec2..de49283493b63f 100644 --- a/src/transformers/models/bridgetower/configuration_bridgetower.py +++ b/src/transformers/models/bridgetower/configuration_bridgetower.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""BridgeTower model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -68,6 +65,7 @@ class BridgeTowerVisionConfig(PretrainedConfig): ```""" model_type = "bridgetower_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -95,21 +93,6 @@ def __init__( self.share_layernorm = share_layernorm self.remove_last_layer = remove_last_layer - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerTextConfig(PretrainedConfig): r""" @@ -175,6 +158,7 @@ class BridgeTowerTextConfig(PretrainedConfig): ```""" model_type = "bridgetower_text_model" + base_config_key = "text_config" def __init__( self, @@ -217,21 +201,6 @@ def __init__( self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "bridgetower": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class BridgeTowerConfig(PretrainedConfig): r""" @@ -288,6 +257,7 @@ class BridgeTowerConfig(PretrainedConfig): ```""" model_type = "bridgetower" + sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} def __init__( self, diff --git a/src/transformers/models/chameleon/configuration_chameleon.py b/src/transformers/models/chameleon/configuration_chameleon.py index 67de37f2d01b2c..9842127e7bb48f 100644 --- a/src/transformers/models/chameleon/configuration_chameleon.py +++ b/src/transformers/models/chameleon/configuration_chameleon.py @@ -62,6 +62,7 @@ class ChameleonVQVAEConfig(PretrainedConfig): """ model_type = "chameleon_vqgan" + base_config_key = "vq_config" def __init__( self, @@ -187,6 +188,7 @@ class ChameleonConfig(PretrainedConfig): ```""" model_type = "chameleon" + sub_configs = {"vq_config": ChameleonVQVAEConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/chinese_clip/configuration_chinese_clip.py b/src/transformers/models/chinese_clip/configuration_chinese_clip.py index 5b37044fab500d..d50d6c842b313c 100644 --- a/src/transformers/models/chinese_clip/configuration_chinese_clip.py +++ b/src/transformers/models/chinese_clip/configuration_chinese_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""Chinese-CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -102,6 +101,7 @@ class ChineseCLIPTextConfig(PretrainedConfig): ```""" model_type = "chinese_clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -141,24 +141,6 @@ def __init__( self.position_embedding_type = position_embedding_type self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPVisionConfig(PretrainedConfig): r""" @@ -215,6 +197,7 @@ class ChineseCLIPVisionConfig(PretrainedConfig): ```""" model_type = "chinese_clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -249,24 +232,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from ChineseCLIPConfig - if config_dict.get("model_type") == "chinese_clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ChineseCLIPConfig(PretrainedConfig): r""" @@ -316,6 +281,7 @@ class ChineseCLIPConfig(PretrainedConfig): ```""" model_type = "chinese_clip" + sub_configs = {"text_config": ChineseCLIPTextConfig, "vision_config": ChineseCLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clap/configuration_clap.py b/src/transformers/models/clap/configuration_clap.py index 1425e2a86289cc..b2added7f0e073 100644 --- a/src/transformers/models/clap/configuration_clap.py +++ b/src/transformers/models/clap/configuration_clap.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLAP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -94,6 +91,7 @@ class ClapTextConfig(PretrainedConfig): ```""" model_type = "clap_text_model" + base_config_key = "text_config" def __init__( self, @@ -137,24 +135,6 @@ def __init__( self.projection_hidden_act = projection_hidden_act self.projection_dim = projection_dim - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class ClapAudioConfig(PretrainedConfig): r""" @@ -245,6 +225,7 @@ class ClapAudioConfig(PretrainedConfig): ```""" model_type = "clap_audio_model" + base_config_key = "audio_config" def __init__( self, @@ -307,24 +288,6 @@ def __init__( self.initializer_factor = initializer_factor self.projection_hidden_act = projection_hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the audio config dict if we are loading from ClapConfig - if config_dict.get("model_type") == "clap": - config_dict = config_dict["audio_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClapConfig(PretrainedConfig): r""" @@ -377,6 +340,7 @@ class ClapConfig(PretrainedConfig): ```""" model_type = "clap" + sub_configs = {"text_config": ClapTextConfig, "audio_config": ClapAudioConfig} def __init__( self, diff --git a/src/transformers/models/clip/configuration_clip.py b/src/transformers/models/clip/configuration_clip.py index 8e027f5c3f010f..2e1f2deede00c9 100644 --- a/src/transformers/models/clip/configuration_clip.py +++ b/src/transformers/models/clip/configuration_clip.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""CLIP model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional if TYPE_CHECKING: @@ -93,6 +92,7 @@ class CLIPTextConfig(PretrainedConfig): ```""" model_type = "clip_text_model" + base_config_key = "text_config" def __init__( self, @@ -130,24 +130,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPVisionConfig(PretrainedConfig): r""" @@ -205,6 +187,7 @@ class CLIPVisionConfig(PretrainedConfig): ```""" model_type = "clip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +222,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPConfig - if config_dict.get("model_type") == "clip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPConfig(PretrainedConfig): r""" @@ -305,6 +270,7 @@ class CLIPConfig(PretrainedConfig): ```""" model_type = "clip" + sub_configs = {"text_config": CLIPTextConfig, "vision_config": CLIPVisionConfig} def __init__( self, text_config=None, vision_config=None, projection_dim=512, logit_scale_init_value=2.6592, **kwargs diff --git a/src/transformers/models/clipseg/configuration_clipseg.py b/src/transformers/models/clipseg/configuration_clipseg.py index 0ac8196fc7f546..5474840f357a34 100644 --- a/src/transformers/models/clipseg/configuration_clipseg.py +++ b/src/transformers/models/clipseg/configuration_clipseg.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""CLIPSeg model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -84,6 +81,7 @@ class CLIPSegTextConfig(PretrainedConfig): ```""" model_type = "clipseg_text_model" + base_config_key = "text_config" def __init__( self, @@ -117,24 +115,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegVisionConfig(PretrainedConfig): r""" @@ -190,6 +170,7 @@ class CLIPSegVisionConfig(PretrainedConfig): ```""" model_type = "clipseg_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -222,24 +203,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from CLIPSegConfig - if config_dict.get("model_type") == "clipseg": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class CLIPSegConfig(PretrainedConfig): r""" @@ -306,6 +269,7 @@ class CLIPSegConfig(PretrainedConfig): ```""" model_type = "clipseg" + sub_configs = {"text_config": CLIPSegTextConfig, "vision_config": CLIPSegVisionConfig} def __init__( self, diff --git a/src/transformers/models/clvp/configuration_clvp.py b/src/transformers/models/clvp/configuration_clvp.py index d17a04c861bf3b..8fd0e150801a66 100644 --- a/src/transformers/models/clvp/configuration_clvp.py +++ b/src/transformers/models/clvp/configuration_clvp.py @@ -91,6 +91,7 @@ class ClvpEncoderConfig(PretrainedConfig): ```""" model_type = "clvp_encoder" + base_config_key = ["text_config", "speech_config"] def __init__( self, @@ -141,7 +142,7 @@ def from_pretrained( # make sure to have the config_type be either "text_config" or "speech_config" # this is to make sure that we can load only text or speech configs from the nested ClvpConfig. - if config_type not in ["text_config", "speech_config"]: + if config_type not in cls.base_config_key: raise ValueError( f"We can only load either 'text_config' or 'speech_config' but you are trying to load" f"{config_type}" ) @@ -253,6 +254,7 @@ class ClvpDecoderConfig(PretrainedConfig): ```""" model_type = "clvp_decoder" + base_config_key = "decoder_config" def __init__( self, @@ -314,24 +316,6 @@ def __init__( super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the speech config dict if we are loading from ClvpConfig - if config_dict.get("model_type") == "clvp": - config_dict = config_dict["decoder_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( 
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class ClvpConfig(PretrainedConfig): r""" @@ -386,7 +370,11 @@ class ClvpConfig(PretrainedConfig): ```""" model_type = "clvp" - is_composition = True + sub_configs = { + "text_config": ClvpEncoderConfig, + "speech_config": ClvpEncoderConfig, + "decoder_config": ClvpDecoderConfig, + } def __init__( self, diff --git a/src/transformers/models/dbrx/configuration_dbrx.py b/src/transformers/models/dbrx/configuration_dbrx.py index dde5232ae5cc9b..302b5e6a55821d 100644 --- a/src/transformers/models/dbrx/configuration_dbrx.py +++ b/src/transformers/models/dbrx/configuration_dbrx.py @@ -41,6 +41,8 @@ class DbrxAttentionConfig(PretrainedConfig): rope_theta (`float`, *optional*, defaults to 10000.0): The base frequency for rope. """ + base_config_key = "attn_config" + def __init__( self, attn_pdrop: float = 0.0, @@ -55,29 +57,12 @@ def __init__( self.kv_n_heads = kv_n_heads self.rope_theta = rope_theta - for k in ["model_type"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxFFNConfig(PretrainedConfig): """Configuration class for Dbrx FFN. @@ -100,6 +85,8 @@ class DbrxFFNConfig(PretrainedConfig): moe_normalize_expert_weights (`float`, *optional*, defaults to 1.0): The normalization factor for the expert weights. """ + base_config_key = "ffn_config" + def __init__( self, ffn_act_fn: dict = None, @@ -122,29 +109,12 @@ def __init__( self.moe_loss_weight = moe_loss_weight self.moe_normalize_expert_weights = moe_normalize_expert_weights - for k in ["model_type"]: + for k in ["model_type", "attn_implementation", "transformers_version", "_commit_hash"]: if k in kwargs: kwargs.pop(k) if len(kwargs) != 0: raise ValueError(f"Found unknown {kwargs=}") - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: str, **kwargs: Any) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "dbrx": - config_dict = config_dict["ffn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class DbrxConfig(PretrainedConfig): r""" @@ -202,6 +172,7 @@ class DbrxConfig(PretrainedConfig): """ model_type = "dbrx" + sub_configs = {"attn_config": DbrxAttentionConfig, "ffn_config": DbrxFFNConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/detr/image_processing_detr_fast.py b/src/transformers/models/detr/image_processing_detr_fast.py index eadde59e55e475..b414b4224e683c 100644 --- a/src/transformers/models/detr/image_processing_detr_fast.py +++ b/src/transformers/models/detr/image_processing_detr_fast.py @@ -347,7 +347,7 @@ def __init__( format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, do_resize: bool = True, size: Dict[str, int] = None, - resample: [Union[PILImageResampling, F.InterpolationMode]] = PILImageResampling.BILINEAR, + resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = True, @@ -462,7 +462,7 @@ def resize( self, image: torch.Tensor, size: SizeDict, - interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + interpolation: "F.InterpolationMode" = None, **kwargs, ) -> torch.Tensor: """ @@ -485,6 +485,7 @@ def resize( interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): Resampling filter to use if resizing the image. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. 
@@ -517,7 +518,7 @@ def resize_annotation( orig_size: Tuple[int, int], target_size: Tuple[int, int], threshold: float = 0.5, - interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + interpolation: "F.InterpolationMode" = None, ): """ Resizes an annotation to a target size. @@ -534,6 +535,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): The resampling filter to use when resizing the masks. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} @@ -680,7 +682,7 @@ def preprocess( masks_path: Optional[Union[str, pathlib.Path]] = None, do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, - resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None, + resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[Union[int, float]] = None, do_normalize: Optional[bool] = None, diff --git a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py index ab5d49b32fea90..5190ed51ffd350 100644 --- a/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/configuration_encoder_decoder.py @@ -17,6 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging +from ..auto import AutoConfig logger = logging.get_logger(__name__) @@ -70,6 +71,7 @@ class EncoderDecoderConfig(PretrainedConfig): ```""" model_type = "encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): @@ -84,8 +86,6 @@ def __init__(self, **kwargs): decoder_config = kwargs.pop("decoder") decoder_model_type = decoder_config.pop("model_type") 
- from ..auto.configuration_auto import AutoConfig - self.encoder = AutoConfig.for_model(encoder_model_type, **encoder_config) self.decoder = AutoConfig.for_model(decoder_model_type, **decoder_config) self.is_encoder_decoder = True diff --git a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py index ade5b8b2667537..59a1b029751646 100644 --- a/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py @@ -164,6 +164,7 @@ class FastSpeech2ConformerConfig(PretrainedConfig): ```""" model_type = "fastspeech2_conformer" + base_config_key = "model_config" attribute_map = {"num_hidden_layers": "encoder_layers", "num_attention_heads": "encoder_num_attention_heads"} def __init__( @@ -377,6 +378,7 @@ class FastSpeech2ConformerHifiGanConfig(PretrainedConfig): ```""" model_type = "hifigan" + base_config_key = "vocoder_config" def __init__( self, @@ -453,7 +455,7 @@ class FastSpeech2ConformerWithHifiGanConfig(PretrainedConfig): """ model_type = "fastspeech2_conformer_with_hifigan" - is_composition = True + sub_configs = {"model_config": FastSpeech2ConformerConfig, "vocoder_config": FastSpeech2ConformerHifiGanConfig} def __init__( self, diff --git a/src/transformers/models/flava/configuration_flava.py b/src/transformers/models/flava/configuration_flava.py index b6349361c0dda8..47cdb488a2eb5d 100644 --- a/src/transformers/models/flava/configuration_flava.py +++ b/src/transformers/models/flava/configuration_flava.py @@ -14,8 +14,7 @@ # limitations under the License. 
"""FLAVA model configurations""" -import os -from typing import Any, Dict, Union +from typing import Any, Dict from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -86,6 +85,7 @@ class FlavaImageConfig(PretrainedConfig): ```""" model_type = "flava_image_model" + base_config_key = "image_config" def __init__( self, @@ -124,24 +124,6 @@ def __init__( self.mask_token = mask_token self.vocab_size = vocab_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaTextConfig(PretrainedConfig): r""" @@ -216,6 +198,7 @@ class FlavaTextConfig(PretrainedConfig): ```""" model_type = "flava_text_model" + base_config_key = "text_config" def __init__( self, @@ -254,24 +237,6 @@ def __init__( self.qkv_bias = qkv_bias self.pad_token_id = pad_token_id - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaMultimodalConfig(PretrainedConfig): r""" @@ -327,6 +292,7 @@ class FlavaMultimodalConfig(PretrainedConfig): ```""" model_type = "flava_multimodal_model" + base_config_key = "multimodal_config" def __init__( self, @@ -357,27 +323,10 @@ def __init__( self.qkv_bias = qkv_bias self.use_cls_token = use_cls_token - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the multimodal config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["multimodal_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaImageCodebookConfig(PretrainedConfig): model_type = "flava_image_codebook" + base_config_key = "image_codebook_config" r""" [`FlavaImageCodebookConfig`] is the configuration class to store the configuration of a [`FlavaImageCodebook`]. 
It @@ -442,24 +391,6 @@ def __init__( self.freeze = freeze self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the image codebook config dict if we are loading from FlavaConfig - if config_dict.get("model_type") == "flava": - config_dict = config_dict["image_codebook_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class FlavaConfig(PretrainedConfig): r""" @@ -532,6 +463,12 @@ class FlavaConfig(PretrainedConfig): """ model_type = "flava" + sub_configs = { + "text_config": FlavaTextConfig, + "image_config": FlavaImageConfig, + "multimodal_config": FlavaMultimodalConfig, + "image_codebook_config": FlavaImageCodebookConfig, + } def __init__( self, diff --git a/src/transformers/models/git/configuration_git.py b/src/transformers/models/git/configuration_git.py index ecaea17ff946af..1be3e7067bdfcf 100644 --- a/src/transformers/models/git/configuration_git.py +++ b/src/transformers/models/git/configuration_git.py @@ -13,8 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -72,6 +70,7 @@ class GitVisionConfig(PretrainedConfig): ```""" model_type = "git_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -102,24 +101,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GITConfig - if config_dict.get("model_type") == "git": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GitConfig(PretrainedConfig): r""" @@ -186,6 +167,7 @@ class GitConfig(PretrainedConfig): ```""" model_type = "git" + sub_configs = {"vision_config": GitVisionConfig} def __init__( self, diff --git a/src/transformers/models/groupvit/configuration_groupvit.py b/src/transformers/models/groupvit/configuration_groupvit.py index e608fbcdbe9c0a..e85e4fc9184371 100644 --- a/src/transformers/models/groupvit/configuration_groupvit.py +++ b/src/transformers/models/groupvit/configuration_groupvit.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""GroupViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Mapping, Optional from ...configuration_utils import PretrainedConfig from ...onnx import OnnxConfig @@ -86,6 +85,7 @@ class GroupViTTextConfig(PretrainedConfig): ```""" model_type = "groupvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -121,24 +121,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTVisionConfig(PretrainedConfig): r""" @@ -197,6 +179,7 @@ class GroupViTVisionConfig(PretrainedConfig): ```""" model_type = "groupvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -246,24 +229,6 @@ def __init__( self.assign_eps = assign_eps self.assign_mlp_ratio = assign_mlp_ratio - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from GroupViTConfig - if config_dict.get("model_type") == "groupvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class GroupViTConfig(PretrainedConfig): r""" @@ -292,6 +257,7 @@ class GroupViTConfig(PretrainedConfig): """ model_type = "groupvit" + sub_configs = {"text_config": GroupViTTextConfig, "vision_config": GroupViTVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics/configuration_idefics.py b/src/transformers/models/idefics/configuration_idefics.py index 56b6025a8e89dd..e34a5764400196 100644 --- a/src/transformers/models/idefics/configuration_idefics.py +++ b/src/transformers/models/idefics/configuration_idefics.py @@ -38,7 +38,7 @@ class IdeficsVisionConfig(PretrainedConfig): documentation from [`PretrainedConfig`] for more information. 
Args: - hidden_size (`int`, *optional*, defaults to 768): + embed_dim (`int`, *optional*, defaults to 768): Dimensionality of the encoder layers and the pooler layer. (elsewhere referred to as `hidden_size`) image_size (`int`, *optional*, defaults to 224): The size (resolution) of each image. @@ -50,12 +50,12 @@ class IdeficsVisionConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 16): Number of attention heads for each attention layer in the Transformer encoder. - image_num_channels (`int`, *optional*, defaults to `3`): + num_channels (`int`, *optional*, defaults to 3): Number of image channels. hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. @@ -64,11 +64,9 @@ class IdeficsVisionConfig(PretrainedConfig): initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1.0, used internally for initialization testing). - initializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation of the truncated_normal_initializer for initializing all weight matrices. 
""" - model_type = "idefics" + model_type = "idefics_vision" attribute_map = { "hidden_size": "embed_dim", } @@ -119,7 +117,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Args: use_resampler (`bool`, *optional*, defaults to `False`): Whether or not to use the resampler - resampler_n_latents (`int`, *optional*, defaults to ): + resampler_n_latents (`int`, *optional*, defaults to 64): Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). resampler_depth (`int`, *optional*, defaults to 6): Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (< 3). @@ -131,7 +129,7 @@ class IdeficsPerceiverConfig(PretrainedConfig): Whether or not to use qk layer norms in perceiver """ - model_type = "idefics" + model_type = "idefics_perciever" def __init__( self, @@ -235,7 +233,7 @@ class IdeficsConfig(PretrainedConfig): ```""" model_type = "idefics" - is_composition = False + sub_configs = {"perceiver_config": IdeficsPerceiverConfig, "vision_config": IdeficsVisionConfig} def __init__( self, diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py index 64743d1cd470e7..408d374c77f7eb 100644 --- a/src/transformers/models/idefics2/configuration_idefics2.py +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -13,12 +13,9 @@ # limitations under the License. 
"""Idefics2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -76,7 +73,8 @@ class Idefics2VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics2" + model_type = "idefics2_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +105,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics2Config - if config_dict.get("model_type") == "idefics2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics2PerceiverConfig(PretrainedConfig): r""" @@ -152,7 +132,7 @@ class Idefics2PerceiverConfig(PretrainedConfig): The dropout ratio for the attention probabilities. 
""" - model_type = "idefics2" + model_type = "idefics2_perceiver" def __init__( self, @@ -220,7 +200,11 @@ class Idefics2Config(PretrainedConfig): ```""" model_type = "idefics2" - is_composition = True + sub_configs = { + "text_config": AutoConfig, + "perceiver_config": Idefics2PerceiverConfig, + "vision_config": Idefics2VisionConfig, + } def __init__( self, diff --git a/src/transformers/models/idefics3/configuration_idefics3.py b/src/transformers/models/idefics3/configuration_idefics3.py index 45afe685f5209c..4b10d8d2d03a81 100644 --- a/src/transformers/models/idefics3/configuration_idefics3.py +++ b/src/transformers/models/idefics3/configuration_idefics3.py @@ -13,12 +13,9 @@ # limitations under the License. """Idefics3 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -57,8 +54,7 @@ class Idefics3VisionConfig(PretrainedConfig): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. - intializer_range (`float`, *optional*, defaults to 0.02): - The standard deviation for initializing all weight matrices in the model. 
+ initializer_range (`float`, *optional*, defaults to 0.02): Example: @@ -76,7 +72,8 @@ class Idefics3VisionConfig(PretrainedConfig): >>> configuration = model.config ```""" - model_type = "idefics3" + model_type = "idefics3_vision" + base_config_key = "vision_config" def __init__( self, @@ -107,24 +104,6 @@ def __init__( self.hidden_act = hidden_act self.initializer_range = initializer_range - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Idefics3Config - if config_dict.get("model_type") == "idefics3": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Idefics3Config(PretrainedConfig): r""" @@ -165,7 +144,7 @@ class Idefics3Config(PretrainedConfig): ```""" model_type = "idefics3" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": Idefics3VisionConfig} def __init__( self, @@ -204,4 +183,4 @@ def __init__( self.text_config = text_config self.scale_factor = scale_factor - super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings) + super().__init__(**kwargs, pad_token_id=pad_token_id, tie_word_embeddings=tie_word_embeddings) diff --git a/src/transformers/models/instructblip/configuration_instructblip.py b/src/transformers/models/instructblip/configuration_instructblip.py index a274212a945e04..6124dba3a08efe 100644 --- a/src/transformers/models/instructblip/configuration_instructblip.py +++ b/src/transformers/models/instructblip/configuration_instructblip.py @@ -14,13 +14,10 @@ # limitations under the License. """InstructBLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,6 +75,7 @@ class InstructBlipVisionConfig(PretrainedConfig): ```""" model_type = "instructblip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -108,24 +106,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = 
config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipQFormerConfig(PretrainedConfig): r""" @@ -192,6 +172,7 @@ class InstructBlipQFormerConfig(PretrainedConfig): ```""" model_type = "instructblip_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -229,24 +210,6 @@ def __init__( self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipConfig - if config_dict.get("model_type") == "instructblip": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipConfig(PretrainedConfig): r""" @@ -305,6 +268,11 @@ class InstructBlipConfig(PretrainedConfig): ```""" model_type = "instructblip" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipQFormerConfig, + "vision_config": InstructBlipVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py index e7c8eeccef98b4..14687a96e54f37 100644 --- a/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/configuration_instructblipvideo.py @@ -19,13 +19,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os -from typing import Union from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -83,6 +81,7 @@ class InstructBlipVideoVisionConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -113,24 +112,6 @@ def __init__( self.hidden_act = hidden_act self.qkv_bias = qkv_bias - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != 
cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoQFormerConfig(PretrainedConfig): r""" @@ -197,6 +178,7 @@ class InstructBlipVideoQFormerConfig(PretrainedConfig): ```""" model_type = "instructblipvideo_qformer" + base_config_key = "qformer_config" def __init__( self, @@ -234,24 +216,6 @@ def __init__( self.cross_attention_frequency = cross_attention_frequency self.encoder_hidden_size = encoder_hidden_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the qformer config dict if we are loading from InstructBlipVideoConfig - if config_dict.get("model_type") == "instructblipvideo": - config_dict = config_dict["qformer_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class InstructBlipVideoConfig(PretrainedConfig): r""" @@ -310,6 +274,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py index 63c6c486854c57..b0dc8a215740f1 100644 --- a/src/transformers/models/instructblipvideo/modular_instructblipvideo.py +++ b/src/transformers/models/instructblipvideo/modular_instructblipvideo.py @@ -32,7 +32,7 @@ from ...configuration_utils import PretrainedConfig from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -103,6 +103,11 @@ class InstructBlipVideoConfig(PretrainedConfig): ```""" model_type = "instructblipvideo" + sub_configs = { + "text_config": AutoConfig, + "qformer_config": InstructBlipVideoQFormerConfig, + "vision_config": InstructBlipVideoVisionConfig, + } def __init__( self, diff --git a/src/transformers/models/kosmos2/configuration_kosmos2.py b/src/transformers/models/kosmos2/configuration_kosmos2.py index e49074f8061b2c..921ec336c0be80 100644 --- a/src/transformers/models/kosmos2/configuration_kosmos2.py +++ b/src/transformers/models/kosmos2/configuration_kosmos2.py @@ -14,9 +14,6 @@ # limitations under the License. """KOSMOS-2 model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -61,7 +58,7 @@ class Kosmos2TextConfig(PretrainedConfig): layerdrop (`float`, *optional*, defaults to 0.0): The LayerDrop probability for the decoder. 
See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) for more details. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. init_std (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. @@ -69,9 +66,16 @@ class Kosmos2TextConfig(PretrainedConfig): Scale embeddings by diving by sqrt(embed_dim). use_cache (`bool`, *optional*, defaults to `True`): Whether or not the model should return the last key/values attentions (not used by all models). + pad_token_id (`int`, *optional*, defaults to 1): + Token id used for padding. + bos_token_id (`int`, *optional*, defaults to 0): + Token id used for beginning of string. + eos_token_id (`int`, *optional*, defaults to 2): + Token id used for end of string. ```""" model_type = "kosmos_2_text_model" + base_config_key = "text_config" keys_to_ignore_at_inference = ["past_key_values"] attribute_map = { "num_attention_heads": "attention_heads", @@ -124,24 +128,6 @@ def __init__( self.scale_embedding = scale_embedding self.use_cache = use_cache - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2VisionConfig(PretrainedConfig): r""" @@ -171,18 +157,19 @@ class Kosmos2VisionConfig(PretrainedConfig): hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. - layer_norm_eps (`float`, *optional*, defaults to 1e-5): + layer_norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation of the truncated_normal_initializer for initializing all weight matrices. - initializer_factor (`float`, *optional*, defaults to 1): + initializer_factor (`float`, *optional*, defaults to 1.0): A factor for initializing all weight matrices (should be kept to 1, used internally for initialization testing). ```""" model_type = "kosmos_2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -215,24 +202,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Kosmos2Config - if config_dict.get("model_type") == "kosmos-2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. 
This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class Kosmos2Config(PretrainedConfig): r""" @@ -267,7 +236,7 @@ class Kosmos2Config(PretrainedConfig): ```""" model_type = "kosmos-2" - is_composition = True + sub_configs = {"text_config": Kosmos2TextConfig, "vision_config": Kosmos2VisionConfig} def __init__( self, diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 3a4cb09855f0ec..05034f5cfcf6f8 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class LlavaConfig(PretrainedConfig): ```""" model_type = "llava" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index e8768dde85722b..54616edbf96dce 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,7 +78,7 @@ class LlavaNextConfig(PretrainedConfig): ```""" model_type = "llava_next" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py 
b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 0e4e39b4b3ab53..2fe889da60336b 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -21,7 +21,7 @@ from ...configuration_utils import PretrainedConfig -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig class LlavaNextVideoConfig(PretrainedConfig): @@ -86,7 +86,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 73118f4bfcd300..b0a20d6c5ccd93 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -95,7 +95,7 @@ def __init__(self, config): mode = config.spatial_pool_mode stride = config.spatial_pool_stride out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size) - self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2 + self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2 if mode == "average": self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride) diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 8018afa7244dd2..3d6431d7ea29ba 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -31,7 +31,7 @@ from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, 
AutoConfig logger = logging.get_logger(__name__) @@ -99,7 +99,7 @@ class LlavaNextVideoConfig(PretrainedConfig): ```""" model_type = "llava_next_video" - is_composition = True + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, @@ -191,7 +191,7 @@ def __init__(self, config): mode = config.spatial_pool_mode stride = config.spatial_pool_stride out_channels = getattr(config, "spatial_pool_out_channels", config.vision_config.hidden_size) - self.image_size = config.vision_config.image_size // config.vision_config.patch_size**2 + self.image_size = (config.vision_config.image_size // config.vision_config.patch_size) ** 2 if mode == "average": self.pool = nn.AvgPool2d(kernel_size=stride, stride=stride) diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index eef86c6c8c019b..46b65b35b1a5cb 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -18,7 +18,7 @@ from ...utils import ( logging, ) -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -81,7 +81,7 @@ class LlavaOnevisionConfig(PretrainedConfig): ```""" model_type = "llava_onevision" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mllama/configuration_mllama.py b/src/transformers/models/mllama/configuration_mllama.py index 539fc61ba4edba..635ca503205f5f 100644 --- a/src/transformers/models/mllama/configuration_mllama.py +++ b/src/transformers/models/mllama/configuration_mllama.py @@ -13,8 +13,7 @@ # limitations under the License. 
"""Mllama model configuration""" -import os -from typing import Dict, List, Optional, Union +from typing import Dict, List, Optional from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation @@ -59,7 +58,7 @@ class MllamaVisionConfig(PretrainedConfig): The size (resolution) of each image *tile*. patch_size (`int`, *optional*, defaults to 14): The size (resolution) of each patch. - norm_eps (`float`, *optional*, defaults to 1e-5): + norm_eps (`float`, *optional*, defaults to 1e-05): The epsilon used by the layer normalization layers. max_num_tiles (`int`, *optional*, defaults to 4): Maximum number of tiles for image splitting. @@ -88,6 +87,7 @@ class MllamaVisionConfig(PretrainedConfig): ```""" model_type = "mllama_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -137,23 +137,6 @@ def __init__( def max_aspect_ratio_id(self) -> int: return len(self.supported_aspect_ratios) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaTextConfig(PretrainedConfig): r""" @@ -178,12 +161,12 @@ class MllamaTextConfig(PretrainedConfig): Number of hidden layers in the Transformer encoder. num_attention_heads (`int`, *optional*, defaults to 32): Number of attention heads for each attention layer in the Transformer encoder. 
- num_key_value_heads (`int`, *optional*): + num_key_value_heads (`int`, *optional*, defaults to 8): This is the number of key_value heads that should be used to implement Grouped Query Attention. If not specified, will default to `num_attention_heads`. intermediate_size (`int`, *optional*, defaults to 14336): Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. - rope_theta (`float`, *optional*, defaults to 500000.0): + rope_theta (`float`, *optional*, defaults to `500000.0`): The base period of the RoPE embeddings. rope_scaling (`Dict`, *optional*): Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type @@ -259,6 +242,7 @@ class MllamaTextConfig(PretrainedConfig): ```""" model_type = "mllama_text_model" + base_config_key = "text_config" def __init__( self, @@ -311,23 +295,6 @@ def __init__( **kwargs, ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mllama": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class MllamaConfig(PretrainedConfig): r""" @@ -370,7 +337,7 @@ class MllamaConfig(PretrainedConfig): ```""" model_type = "mllama" - is_composition = True + sub_configs = {"text_config": MllamaTextConfig, "vision_config": MllamaVisionConfig} def __init__( self, diff --git a/src/transformers/models/moshi/configuration_moshi.py b/src/transformers/models/moshi/configuration_moshi.py index 654e4e82a491b7..1b31141f020db5 100644 --- a/src/transformers/models/moshi/configuration_moshi.py +++ b/src/transformers/models/moshi/configuration_moshi.py @@ -235,8 +235,8 @@ class MoshiConfig(PretrainedConfig): ```""" model_type = "moshi" - is_composition = True keys_to_ignore_at_inference = ["past_key_values"] + sub_configs = {"audio_encoder_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/mpt/configuration_mpt.py b/src/transformers/models/mpt/configuration_mpt.py index ed822c813ba26e..8ee3f8c0c07428 100644 --- a/src/transformers/models/mpt/configuration_mpt.py +++ b/src/transformers/models/mpt/configuration_mpt.py @@ -41,22 +41,22 @@ class MptAttentionConfig(PretrainedConfig): Args: attn_type (`str`, *optional*, defaults to `"multihead_attention"`): type of attention to use. Options: `"multihead_attention"`, `"multiquery_attention"`. - attn_pdrop (`float`, *optional*, defaults to 0.0): + attn_pdrop (`float`, *optional*, defaults to `0.0`): The dropout probability for the attention layers. attn_impl (`str`, *optional*, defaults to `"torch"`): The attention implementation to use. One of `"torch"`, `"flash"`, or `"triton"`. clip_qkv (`float`, *optional*): If not `None`, clip the queries, keys, and values in the attention layer to this value. - softmax_scale (`float`, *optional*, defaults to `None`): + softmax_scale (`float`, *optional*): If not `None`, scale the softmax in the attention layer by this value. If `None`, will default to `1/sqrt(hidden_size)`. 
- prefix_lm (`bool`, *optional*, defaults to `False`)): + prefix_lm (`bool`, *optional*, defaults to `False`): Whether the model should operate as a Prefix LM. This requires passing an extra `prefix_mask` argument which indicates which tokens belong to the prefix. Tokens in the prefix can attend to one another bi-directionally. Tokens outside the prefix use causal attention. qk_ln (`bool`, *optional*, defaults to `False`): Whether to apply layer normalization to the queries and keys in the attention layer. - attn_uses_sequence_id (`bool`, *optional*, defaults to `False`)): + attn_uses_sequence_id (`bool`, *optional*, defaults to `False`): Whether to restrict attention to tokens that have the same token_type_ids. When the model is in `train` mode, this requires passing an extra *token_type_ids* argument which indicates which sub-sequence each token belongs to. Defaults to `False` meaning any provided *token_type_ids* will be ignored. @@ -66,6 +66,8 @@ class MptAttentionConfig(PretrainedConfig): The maximum value of the alibi bias. """ + base_config_key = "attn_config" + def __init__( self, attn_type="multihead_attention", @@ -97,23 +99,6 @@ def __init__( f"`attn_type` has to be either `multihead_attention` or `multiquery_attention`. Received: {attn_type}" ) - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "mpt": - config_dict = config_dict["attn_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class MptConfig(PretrainedConfig): """ @@ -188,6 +173,7 @@ class MptConfig(PretrainedConfig): """ model_type = "mpt" + sub_configs = {"attn_config": MptAttentionConfig} attribute_map = { "num_attention_heads": "n_heads", "hidden_size": "d_model", diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 0d282355defa96..00c03072198092 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -76,6 +76,7 @@ class MusicgenDecoderConfig(PretrainedConfig): """ model_type = "musicgen_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -189,6 +190,11 @@ class MusicgenConfig(PretrainedConfig): ```""" model_type = "musicgen" + sub_configs = { + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenDecoderConfig, + } is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 8a77cea0252234..e65ad50021c3ab 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -78,6 +78,7 @@ class MusicgenMelodyDecoderConfig(PretrainedConfig): """ model_type = "musicgen_melody_decoder" + base_config_key = "decoder_config" keys_to_ignore_at_inference = ["past_key_values"] def __init__( @@ -195,6 +196,11 @@ class MusicgenMelodyConfig(PretrainedConfig): ```""" model_type = "musicgen_melody" + sub_configs = { + "text_encoder": AutoConfig, + "audio_encoder": AutoConfig, + "decoder": MusicgenMelodyDecoderConfig, + } is_composition = True def __init__( diff --git a/src/transformers/models/owlv2/configuration_owlv2.py 
b/src/transformers/models/owlv2/configuration_owlv2.py index 43019553c5c6dc..f9085eaf9c1546 100644 --- a/src/transformers/models/owlv2/configuration_owlv2.py +++ b/src/transformers/models/owlv2/configuration_owlv2.py @@ -14,8 +14,7 @@ # limitations under the License. """OWLv2 model configuration""" -import os -from typing import TYPE_CHECKING, Dict, Union +from typing import TYPE_CHECKING, Dict if TYPE_CHECKING: @@ -90,6 +89,7 @@ class Owlv2TextConfig(PretrainedConfig): ```""" model_type = "owlv2_text_model" + base_config_key = "text_config" def __init__( self, @@ -123,24 +123,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTVisionConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2, 32->16 class Owlv2VisionConfig(PretrainedConfig): @@ -197,6 +179,7 @@ class Owlv2VisionConfig(PretrainedConfig): ```""" model_type = "owlv2_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -229,24 +212,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from Owlv2Config - if config_dict.get("model_type") == "owlv2": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - # Copied from transformers.models.owlvit.configuration_owlvit.OwlViTConfig with OwlViT->Owlv2, owlvit-base-patch32->owlv2-base-patch16, owlvit->owlv2, OWL-ViT->OWLv2 class Owlv2Config(PretrainedConfig): @@ -276,6 +241,7 @@ class Owlv2Config(PretrainedConfig): """ model_type = "owlv2" + sub_configs = {"text_config": Owlv2TextConfig, "vision_config": Owlv2VisionConfig} def __init__( self, @@ -304,20 +270,6 @@ def __init__( self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/owlvit/configuration_owlvit.py b/src/transformers/models/owlvit/configuration_owlvit.py index 877b348f32c121..8be707ce99a1c6 100644 --- a/src/transformers/models/owlvit/configuration_owlvit.py +++ b/src/transformers/models/owlvit/configuration_owlvit.py @@ -14,9 +14,8 @@ # limitations under the License. 
"""OWL-ViT model configuration""" -import os from collections import OrderedDict -from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional, Union +from typing import TYPE_CHECKING, Any, Dict, Mapping, Optional if TYPE_CHECKING: @@ -92,6 +91,7 @@ class OwlViTTextConfig(PretrainedConfig): ```""" model_type = "owlvit_text_model" + base_config_key = "text_config" def __init__( self, @@ -125,24 +125,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTVisionConfig(PretrainedConfig): r""" @@ -198,6 +180,7 @@ class OwlViTVisionConfig(PretrainedConfig): ```""" model_type = "owlvit_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -230,24 +213,6 @@ def __init__( self.initializer_range = initializer_range self.initializer_factor = initializer_factor - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from OwlViTConfig - if config_dict.get("model_type") == "owlvit": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class OwlViTConfig(PretrainedConfig): r""" @@ -276,6 +241,7 @@ class OwlViTConfig(PretrainedConfig): """ model_type = "owlvit" + sub_configs = {"text_config": OwlViTTextConfig, "vision_config": OwlViTVisionConfig} def __init__( self, @@ -304,20 +270,6 @@ def __init__( self.return_dict = return_dict self.initializer_factor = 1.0 - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - @classmethod def from_text_vision_configs(cls, text_config: Dict, vision_config: Dict, **kwargs): r""" diff --git a/src/transformers/models/paligemma/configuration_paligemma.py b/src/transformers/models/paligemma/configuration_paligemma.py index 64598436dbbf1f..de60c501292b30 100644 --- a/src/transformers/models/paligemma/configuration_paligemma.py +++ b/src/transformers/models/paligemma/configuration_paligemma.py @@ -17,7 +17,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -73,7 +73,7 @@ class PaliGemmaConfig(PretrainedConfig): ```""" model_type = "paligemma" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py 
b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py index deb276f334723c..925aa60a8dc6de 100644 --- a/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/configuration_qwen2_audio.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -157,7 +157,7 @@ class Qwen2AudioConfig(PretrainedConfig): ```""" model_type = "qwen2_audio" - is_composition = False + sub_configs = {"text_config": AutoConfig, "audio_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py index 1349006e768cd4..55042327de4ec3 100644 --- a/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/configuration_qwen2_vl.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Qwen2VL model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...modeling_rope_utils import rope_config_validation from ...utils import logging @@ -27,6 +24,7 @@ class Qwen2VLVisionConfig(PretrainedConfig): model_type = "qwen2_vl" + base_config_key = "vision_config" def __init__( self, @@ -55,23 +53,6 @@ def __init__( self.spatial_merge_size = spatial_merge_size self.temporal_patch_size = temporal_patch_size - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - if config_dict.get("model_type") == "qwen2_vl": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class Qwen2VLConfig(PretrainedConfig): r""" @@ -180,6 +161,7 @@ class Qwen2VLConfig(PretrainedConfig): ```""" model_type = "qwen2_vl" + sub_configs = {"vision_config": Qwen2VLVisionConfig} keys_to_ignore_at_inference = ["past_key_values"] def __init__( diff --git a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py index 9f63b5b7ced467..0470352d38f456 100644 --- a/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py +++ b/src/transformers/models/rt_detr/image_processing_rt_detr_fast.py @@ -43,7 +43,6 @@ get_image_type, infer_channel_dimension_format, make_list_of_images, - pil_torch_interpolation_mapping, validate_annotations, ) from ...utils import ( @@ -197,7 +196,7 @@ def __init__( format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION, do_resize: bool = True, size: Dict[str, int] = None, - resample: Union[PILImageResampling, F.InterpolationMode] = PILImageResampling.BILINEAR, + resample: Union[PILImageResampling, "F.InterpolationMode"] = PILImageResampling.BILINEAR, do_rescale: bool = True, rescale_factor: Union[int, float] = 1 / 255, do_normalize: bool = False, @@ -256,7 +255,7 @@ def resize( self, image: torch.Tensor, size: SizeDict, - interpolation: F.InterpolationMode = F.InterpolationMode.BILINEAR, + interpolation: "F.InterpolationMode" = None, **kwargs, ) -> torch.Tensor: """ @@ -279,6 +278,7 @@ def resize( interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): Resampling filter to use if resizing the image. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR if size.shortest_edge and size.longest_edge: # Resize the image so that the shortest edge or the longest edge is of the given size # while maintaining the aspect ratio of the original image. 
@@ -312,7 +312,7 @@ def resize_annotation( orig_size: Tuple[int, int], target_size: Tuple[int, int], threshold: float = 0.5, - interpolation: F.InterpolationMode = F.InterpolationMode.NEAREST, + interpolation: "F.InterpolationMode" = None, ): """ Resizes an annotation to a target size. @@ -329,6 +329,7 @@ def resize_annotation( resample (`InterpolationMode`, defaults to `InterpolationMode.NEAREST`): The resampling filter to use when resizing the masks. """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)] new_annotation = {} @@ -480,7 +481,7 @@ def preprocess( masks_path: Optional[Union[str, pathlib.Path]] = None, do_resize: Optional[bool] = None, size: Optional[Dict[str, int]] = None, - resample: Optional[Union[PILImageResampling, F.InterpolationMode]] = None, + resample: Optional[Union[PILImageResampling, "F.InterpolationMode"]] = None, do_rescale: Optional[bool] = None, rescale_factor: Optional[Union[int, float]] = None, do_normalize: Optional[bool] = None, diff --git a/src/transformers/models/siglip/configuration_siglip.py b/src/transformers/models/siglip/configuration_siglip.py index 73622373cbab5d..cc8fae93cdb25b 100644 --- a/src/transformers/models/siglip/configuration_siglip.py +++ b/src/transformers/models/siglip/configuration_siglip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""Siglip model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class SiglipTextConfig(PretrainedConfig): ```""" model_type = "siglip_text_model" + base_config_key = "text_config" def __init__( self, @@ -110,24 +108,6 @@ def __init__( self.hidden_act = hidden_act self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipVisionConfig(PretrainedConfig): r""" @@ -178,6 +158,7 @@ class SiglipVisionConfig(PretrainedConfig): ```""" model_type = "siglip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -206,24 +187,6 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.hidden_act = hidden_act - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SiglipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class SiglipConfig(PretrainedConfig): r""" @@ -268,6 +231,7 @@ class SiglipConfig(PretrainedConfig): ```""" model_type = "siglip" + sub_configs = {"text_config": SiglipTextConfig, "vision_config": SiglipVisionConfig} def __init__(self, text_config=None, vision_config=None, **kwargs): super().__init__(**kwargs) diff --git a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py index 32a58ec5589eed..d7e0211610b657 100644 --- a/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py +++ b/src/transformers/models/speech_encoder_decoder/configuration_speech_encoder_decoder.py @@ -71,6 +71,7 @@ class SpeechEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "speech-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git a/src/transformers/models/unispeech/modeling_unispeech.py b/src/transformers/models/unispeech/modeling_unispeech.py index 52ba08f5d4eda5..6ce5e77706d358 100755 --- a/src/transformers/models/unispeech/modeling_unispeech.py +++ b/src/transformers/models/unispeech/modeling_unispeech.py @@ -1577,7 +1577,7 @@ def forward( quantized_features, codevector_perplexity = self.quantizer(extract_features) # project quantized features twice - quantized_features = self.project_q(quantized_features) + quantized_features = self.project_q(quantized_features.to(self.project_q.weight.dtype)) quantized_features = self.project_hid(quantized_features) prob_replace_matrix = torch.empty(transformer_features.size(0), transformer_features.size(1)).fill_( diff --git a/src/transformers/models/video_llava/configuration_video_llava.py b/src/transformers/models/video_llava/configuration_video_llava.py index 8738a02585e039..87d96ca24ffdb4 100644 --- 
a/src/transformers/models/video_llava/configuration_video_llava.py +++ b/src/transformers/models/video_llava/configuration_video_llava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -78,7 +78,7 @@ class VideoLlavaConfig(PretrainedConfig): ```""" model_type = "video_llava" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index f88be5adfba028..f26c2b2f50fb6a 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -15,7 +15,7 @@ from ...configuration_utils import PretrainedConfig from ...utils import logging -from ..auto import CONFIG_MAPPING +from ..auto import CONFIG_MAPPING, AutoConfig logger = logging.get_logger(__name__) @@ -72,7 +72,7 @@ class VipLlavaConfig(PretrainedConfig): ```""" model_type = "vipllava" - is_composition = False + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} def __init__( self, diff --git a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py index a4aa663f98526f..59678f2573ff0e 100644 --- a/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py +++ b/src/transformers/models/vision_encoder_decoder/configuration_vision_encoder_decoder.py @@ -78,6 +78,7 @@ class VisionEncoderDecoderConfig(PretrainedConfig): ```""" model_type = "vision-encoder-decoder" + sub_configs = {"encoder": AutoConfig, "decoder": AutoConfig} is_composition = True def __init__(self, **kwargs): diff --git 
a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py index 4cea34ca2313bc..0d79720e1aa8d2 100644 --- a/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/configuration_vision_text_dual_encoder.py @@ -75,6 +75,7 @@ class VisionTextDualEncoderConfig(PretrainedConfig): ```""" model_type = "vision-text-dual-encoder" + sub_configs = {"vision_config": AutoConfig, "text_config": AutoConfig} is_composition = True def __init__(self, projection_dim=512, logit_scale_init_value=2.6592, **kwargs): diff --git a/src/transformers/models/x_clip/configuration_x_clip.py b/src/transformers/models/x_clip/configuration_x_clip.py index 827046b6c35380..3d3b92d2c8c02e 100644 --- a/src/transformers/models/x_clip/configuration_x_clip.py +++ b/src/transformers/models/x_clip/configuration_x_clip.py @@ -14,9 +14,6 @@ # limitations under the License. 
"""X-CLIP model configuration""" -import os -from typing import Union - from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -79,6 +76,7 @@ class XCLIPTextConfig(PretrainedConfig): ```""" model_type = "xclip_text_model" + base_config_key = "text_config" def __init__( self, @@ -112,24 +110,6 @@ def __init__( self.initializer_factor = initializer_factor self.attention_dropout = attention_dropout - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the text config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["text_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPVisionConfig(PretrainedConfig): r""" @@ -195,6 +175,7 @@ class XCLIPVisionConfig(PretrainedConfig): ```""" model_type = "xclip_vision_model" + base_config_key = "vision_config" def __init__( self, @@ -239,24 +220,6 @@ def __init__( self.hidden_act = hidden_act self.drop_path_rate = drop_path_rate - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from XCLIPConfig - if config_dict.get("model_type") == "xclip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." - ) - - return cls.from_dict(config_dict, **kwargs) - class XCLIPConfig(PretrainedConfig): r""" @@ -295,6 +258,7 @@ class XCLIPConfig(PretrainedConfig): """ model_type = "xclip" + sub_configs = {"text_config": XCLIPTextConfig, "vision_config": XCLIPVisionConfig} def __init__( self, diff --git a/src/transformers/quantizers/quantizer_torchao.py b/src/transformers/quantizers/quantizer_torchao.py index f6bf431aa028d4..9a03eb25f4de0d 100644 --- a/src/transformers/quantizers/quantizer_torchao.py +++ b/src/transformers/quantizers/quantizer_torchao.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
import importlib +import types from typing import TYPE_CHECKING, Union from packaging import version @@ -30,9 +31,7 @@ if is_torch_available(): import torch - -if is_torchao_available(): - from torchao.quantization import quantize_ + import torch.nn as nn logger = logging.get_logger(__name__) @@ -46,6 +45,25 @@ def find_parent(model, name): return parent +def _quantization_type(weight): + from torchao.dtypes import AffineQuantizedTensor + from torchao.quantization.linear_activation_quantized_tensor import LinearActivationQuantizedTensor + + if isinstance(weight, AffineQuantizedTensor): + return f"{weight.__class__.__name__}({weight._quantization_type()})" + + if isinstance(weight, LinearActivationQuantizedTensor): + return f"{weight.__class__.__name__}(activation={weight.input_quant_func}, weight={_quantization_type(weight.original_weight_tensor)})" + + +def _linear_extra_repr(self): + weight = _quantization_type(self.weight) + if weight is None: + return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight=None" + else: + return f"in_features={self.weight.shape[1]}, out_features={self.weight.shape[0]}, weight={weight}" + + class TorchAoHfQuantizer(HfQuantizer): """ Quantizer for torchao: https://github.com/pytorch/ao/ @@ -152,9 +170,17 @@ def create_quantized_param( Each nn.Linear layer that needs to be quantized is processsed here. First, we set the value the weight tensor, then we move it to the target device. Finally, we quantize the module. 
""" + from torchao.quantization import quantize_ + module, tensor_name = get_module_from_name(model, param_name) - module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) - quantize_(module, self.quantization_config.get_apply_tensor_subclass()) + + if self.pre_quantized: + module._parameters[tensor_name] = torch.nn.Parameter(param_value.to(device=target_device)) + if isinstance(module, nn.Linear): + module.extra_repr = types.MethodType(_linear_extra_repr, module) + else: + module._parameters[tensor_name] = torch.nn.Parameter(param_value).to(device=target_device) + quantize_(module, self.quantization_config.get_apply_tensor_subclass()) def _process_model_after_weight_loading(self, model): """No process required for torchao quantized model""" diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 43e13abe563115..381f3ef497d9bd 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1722,7 +1722,7 @@ def apply_chat_template( if start_token is None: # start_token is out of bounds maybe due to truncation. 
break - for token_id in range(start_token, end_token + 1 if end_token else len(input_ids)): + for token_id in range(start_token, end_token + 1 if end_token else len(input_ids[i])): current_mask[token_id] = 1 assistant_masks.append(current_mask) out["assistant_masks"] = assistant_masks if is_batched else assistant_masks[0] diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 026a2066798574..2f04df97e863a5 100755 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -1309,7 +1309,8 @@ def get_apply_tensor_subclass(self): return _STR_TO_METHOD[self.quant_type](**self.quant_type_kwargs) def __repr__(self): - return f"{self.quant_type}({', '.join(str(k) + '=' + str(v) for k, v in self.kwargs.items())})" + config_dict = self.to_dict() + return f"{self.__class__.__name__} {json.dumps(config_dict, indent=2, sort_keys=True)}\n" @dataclass diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index 3bd8ce4b59c9b1..cbe851e97e9aed 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -185,16 +185,16 @@ def _get_logits_processor_kwargs(self, do_sample=False, config=None): # This is a band-aid for VLM models, to ensure they don't generate image/video tokens which would cause them # to crash. On pretrained models this isn't a risk, as they are trained to not generate these tokens. 
if config is not None: - image_token_index = ( - config.image_token_index - if getattr(config, "image_token_index", None) is not None - else getattr(config, "image_token_id", None) - ) - video_token_index = config.video_token_index if hasattr(config, "video_token_index") else None - if image_token_index is not None and image_token_index < config.get_text_config().vocab_size: - logits_processor_kwargs["bad_words_ids"].append([image_token_index]) - if video_token_index is not None and video_token_index < config.get_text_config().vocab_size: - logits_processor_kwargs["bad_words_ids"].append([video_token_index]) + for key in [ + "image_token_index", + "image_token_id", + "video_token_index", + "video_token_id", + "vision_start_token_id", + ]: + token_index = getattr(config, key, None) + if token_index is not None and token_index < config.get_text_config().vocab_size: + logits_processor_kwargs["bad_words_ids"].append([token_index]) return logits_processor_kwargs diff --git a/tests/models/albert/test_modeling_albert.py b/tests/models/albert/test_modeling_albert.py index 970f1dd8555e47..0a123c02ab778b 100644 --- a/tests/models/albert/test_modeling_albert.py +++ b/tests/models/albert/test_modeling_albert.py @@ -17,10 +17,11 @@ import unittest from packaging import version +from parameterized import parameterized from transformers import AlbertConfig, AutoTokenizer, is_torch_available from transformers.models.auto import get_values -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_sdpa, slow, torch_device from ...test_configuration_common import ConfigTester from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask @@ -288,6 +289,12 @@ def setUp(self): self.model_tester = AlbertModelTester(self) self.config_tester = ConfigTester(self, config_class=AlbertConfig, hidden_size=37) + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + 
@require_torch_sdpa + @unittest.skip("Albert requires `head_mask` which is currently not done in this test.") + def test_eager_matches_sdpa_inference(self): + pass + def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/align/test_modeling_align.py b/tests/models/align/test_modeling_align.py index ddeb585a757d5d..3c7e679686f617 100644 --- a/tests/models/align/test_modeling_align.py +++ b/tests/models/align/test_modeling_align.py @@ -457,11 +457,20 @@ class AlignModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = AlignModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AlignConfig, + has_text_modality=False, + common_properties=["projection_dim", "temperature_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Start to fail after using torch `cu118`.") def test_multi_gpu_data_parallel_forward(self): super().test_multi_gpu_data_parallel_forward() diff --git a/tests/models/altclip/test_modeling_altclip.py b/tests/models/altclip/test_modeling_altclip.py index 0175e562eda618..658e2e38d9adb5 100755 --- a/tests/models/altclip/test_modeling_altclip.py +++ b/tests/models/altclip/test_modeling_altclip.py @@ -452,11 +452,20 @@ def is_pipeline_test_to_skip( def setUp(self): self.model_tester = AltCLIPModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=AltCLIPConfig, + has_text_modality=False, + common_properties=["projection_dim", "logit_scale_init_value"], + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model 
tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip/test_modeling_blip.py b/tests/models/blip/test_modeling_blip.py index d542757cbf879f..7e1dbbe6bb9cb0 100644 --- a/tests/models/blip/test_modeling_blip.py +++ b/tests/models/blip/test_modeling_blip.py @@ -449,11 +449,18 @@ class BlipModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = BlipModelTester(self) + common_properties = ["logit_scale_init_value", "image_text_hidden_size", "projection_dim", "label_smoothing"] + self.config_tester = ConfigTester( + self, config_class=BlipConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index 1ec9c2e1c07cdd..0943661b96666c 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -482,6 +482,13 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT def setUp(self): self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["image_token_index", "num_query_tokens", "image_text_hidden_size"] + self.config_tester = ConfigTester( + self, config_class=Blip2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/clap/test_modeling_clap.py b/tests/models/clap/test_modeling_clap.py index 9f8cc62d2e0fc3..60b77d0efa4b7b 
100644 --- a/tests/models/clap/test_modeling_clap.py +++ b/tests/models/clap/test_modeling_clap.py @@ -515,11 +515,18 @@ class ClapModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = ClapModelTester(self) + common_properties = ["logit_scale_init_value", "projection_hidden_act", "projection_dim"] + self.config_tester = ConfigTester( + self, config_class=ClapConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clip/test_modeling_clip.py b/tests/models/clip/test_modeling_clip.py index a7c8c8ef8410e8..fa5de84e06205f 100644 --- a/tests/models/clip/test_modeling_clip.py +++ b/tests/models/clip/test_modeling_clip.py @@ -745,11 +745,18 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = CLIPModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/clipseg/test_modeling_clipseg.py b/tests/models/clipseg/test_modeling_clipseg.py index 75ffa7ad23c2ff..b2b047bb502cce 100644 --- a/tests/models/clipseg/test_modeling_clipseg.py +++ 
b/tests/models/clipseg/test_modeling_clipseg.py @@ -472,11 +472,18 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = CLIPSegModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=CLIPSegConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + def test_model_for_image_segmentation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model_for_image_segmentation(*config_and_inputs) diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py index 12e58500063a17..a212b4781d0a78 100644 --- a/tests/models/clvp/test_modeling_clvp.py +++ b/tests/models/clvp/test_modeling_clvp.py @@ -414,7 +414,13 @@ class ClvpModelForConditionalGenerationTest(ModelTesterMixin, unittest.TestCase) def setUp(self): self.model_tester = ClvpModelForConditionalGenerationTester(self) - self.clvp_config_tester = ConfigTester(self, config_class=ClvpConfig, hidden_size=32) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.clvp_config_tester = ConfigTester( + self, config_class=ClvpConfig, has_text_modality=False, common_properties=common_properties, hidden_size=32 + ) + + def test_config(self): + self.clvp_config_tester.run_common_tests() def tearDown(self): super().tearDown() diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py index 3a74a1557cf9ba..d4c51cea125720 100644 --- a/tests/models/distilbert/test_modeling_distilbert.py +++ b/tests/models/distilbert/test_modeling_distilbert.py @@ -30,6 +30,7 @@ import torch from transformers import ( 
+ AutoTokenizer, DistilBertForMaskedLM, DistilBertForMultipleChoice, DistilBertForQuestionAnswering, @@ -38,6 +39,7 @@ DistilBertModel, ) from transformers.models.distilbert.modeling_distilbert import _create_sinusoidal_embeddings + from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_4 class DistilBertModelTester: @@ -420,3 +422,45 @@ def test_inference_no_head_absolute_embedding(self): ) self.assertTrue(torch.allclose(output[:, 1:4, 1:4], expected_slice, atol=1e-4)) + + @slow + def test_export(self): + if not is_torch_greater_or_equal_than_2_4: + self.skipTest(reason="This test requires torch >= 2.4 to run.") + + distilbert_model = "distilbert-base-uncased" + device = "cpu" + attn_implementation = "sdpa" + max_length = 64 + + tokenizer = AutoTokenizer.from_pretrained(distilbert_model) + inputs = tokenizer( + f"Paris is the {tokenizer.mask_token} of France.", + return_tensors="pt", + padding="max_length", + max_length=max_length, + ) + + model = DistilBertForMaskedLM.from_pretrained( + distilbert_model, + device_map=device, + attn_implementation=attn_implementation, + ) + + logits = model(**inputs).logits + eager_predicted_mask = tokenizer.decode(logits[0, 4].topk(5).indices) + self.assertEqual( + eager_predicted_mask.split(), + ["capital", "birthplace", "northernmost", "centre", "southernmost"], + ) + + exported_program = torch.export.export( + model, + args=(inputs["input_ids"],), + kwargs={"attention_mask": inputs["attention_mask"]}, + strict=True, + ) + + result = exported_program.module().forward(inputs["input_ids"], inputs["attention_mask"]) + exported_predicted_mask = tokenizer.decode(result.logits[0, 4].topk(5).indices) + self.assertEqual(eager_predicted_mask, exported_predicted_mask) diff --git a/tests/models/flava/test_modeling_flava.py b/tests/models/flava/test_modeling_flava.py index d8c8f385e9ce11..1c35fd705ccd87 100644 --- a/tests/models/flava/test_modeling_flava.py +++ b/tests/models/flava/test_modeling_flava.py @@ -931,11 +931,18 
@@ class FlavaModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = self.class_for_tester(self) + common_properties = ["projection_dim", "logit_scale_init_value", "init_codebook"] + self.config_tester = ConfigTester( + self, config_class=FlavaConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs_for_common() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/glm/test_modeling_glm.py b/tests/models/glm/test_modeling_glm.py index b92c5db815b77a..ebac3b9167ce26 100644 --- a/tests/models/glm/test_modeling_glm.py +++ b/tests/models/glm/test_modeling_glm.py @@ -14,13 +14,9 @@ # limitations under the License. """Testing suite for the PyTorch Glm model.""" -import inspect -import tempfile import unittest -import numpy as np import pytest -from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer, GlmConfig, is_torch_available from transformers.testing_utils import ( @@ -32,7 +28,6 @@ slow, torch_device, ) -from transformers.utils import is_torch_bf16_available_on_device, is_torch_fp16_available_on_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -421,303 +416,6 @@ def test_custom_4d_attention_mask(self): torch.testing.assert_close(normalized_0, normalized_1, rtol=1e-3, atol=1e-3) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - @is_flaky - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - """Overwrite to add flakyness: some cases can sometimes fail""" - if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): - 
self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") - - if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): - self.skipTest( - f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" - ) - - # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. - if torch_dtype == "float16": - torch_dtype = torch.float16 - elif torch_dtype == "bfloat16": - torch_dtype = torch.bfloat16 - elif torch_dtype == "float32": - torch_dtype = torch.float32 - - atols = { - ("cpu", False, torch.float32): 1e-6, - ("cpu", False, torch.bfloat16): 1e-2, - ("cpu", True, torch.float32): 1e-6, - ("cpu", True, torch.bfloat16): 1e-2, - ("cuda", False, torch.float32): 1e-6, - ("cuda", False, torch.bfloat16): 1e-2, - ("cuda", False, torch.float16): 5e-3, - ("cuda", True, torch.float32): 1e-6, - ("cuda", True, torch.bfloat16): 1e-2, - ("cuda", True, torch.float16): 5e-3, - } - rtols = { - ("cpu", False, torch.float32): 1e-4, - ("cpu", False, torch.bfloat16): 1e-2, - ("cpu", True, torch.float32): 1e-4, - ("cpu", True, torch.bfloat16): 1e-2, - ("cuda", False, torch.float32): 1e-4, - ("cuda", False, torch.bfloat16): 1e-2, - ("cuda", False, torch.float16): 5e-3, - ("cuda", True, torch.float32): 1e-4, - ("cuda", True, torch.bfloat16): 3e-2, - ("cuda", True, torch.float16): 5e-3, - } - - def get_mean_reldiff(failcase, x, ref, atol, rtol): - return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" - - for model_class in self.all_model_classes: - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors. - # These models support masking only in the case `use_mask_token=True`. 
Otherwise they cannot consume an input mask. - # This means that the class needs to be instantiated much later, after `use_mask` is set, which means a significant refactor of the code. - # However masking there is not done at any layers that matters (i.e self-attention), therefore we can safely deactivate it. - deactivate_mask = "use_mask_token" in inspect.signature(model_class).parameters - - is_encoder_decoder = model.config.is_encoder_decoder - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) - model_sdpa = model_sdpa.eval().to(torch_device) - - self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") - - model_eager = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch_dtype, - attn_implementation="eager", - ) - model_eager = model_eager.eval().to(torch_device) - - self.assertTrue(model_eager.config._attn_implementation == "eager") - - for name, submodule in model_eager.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - raise ValueError("The eager model should not have SDPA attention layers") - - has_sdpa = False - for name, submodule in model_sdpa.named_modules(): - class_name = submodule.__class__.__name__ - if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name: - has_sdpa = True - break - if not has_sdpa and model_sdpa.config.model_type != "falcon": - raise ValueError("The SDPA model should have SDPA attention layers") - - # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, - # but it would be nicer to have an efficient way to use parameterized.expand - fail_cases = [] - for padding_side in ["left", "right"]: - for use_mask in [False, True]: - for output_attentions in [True, False]: - can_output_attn = "output_attentions" in 
inspect.signature(model_sdpa.forward).parameters - if not (self.has_attentions and can_output_attn) and output_attentions: - continue - for batch_size in [1, 5]: - dummy_input = inputs_dict[model.main_input_name] - - if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: - dummy_input = dummy_input.to(torch_dtype) - - dummy_input = dummy_input[:batch_size] - if dummy_input.shape[0] != batch_size: - if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: - extension = torch.rand( - batch_size - dummy_input.shape[0], - *dummy_input.shape[1:], - dtype=torch_dtype, - device=torch_device, - ) - dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) - else: - extension = torch.randint( - high=5, - size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]), - dtype=dummy_input.dtype, - device=torch_device, - ) - dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) - - if not use_mask: - dummy_attention_mask = None - else: - dummy_attention_mask = inputs_dict.get("attention_mask", None) - if dummy_attention_mask is None: - if is_encoder_decoder: - seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] - else: - seqlen = dummy_input.shape[-1] - dummy_attention_mask = ( - torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) - ) - - dummy_attention_mask = dummy_attention_mask[:batch_size] - if dummy_attention_mask.shape[0] != batch_size: - extension = torch.ones( - batch_size - dummy_attention_mask.shape[0], - *dummy_attention_mask.shape[1:], - dtype=dummy_attention_mask.dtype, - device=torch_device, - ) - dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) - dummy_attention_mask = dummy_attention_mask.to(torch_device) - - dummy_attention_mask[:] = 1 - if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 - elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 - 
- for enable_kernels in [False, True]: - failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" - if is_encoder_decoder: - decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[ - :batch_size - ] - if decoder_input_ids.shape[0] != batch_size: - extension = torch.ones( - batch_size - decoder_input_ids.shape[0], - *decoder_input_ids.shape[1:], - dtype=decoder_input_ids.dtype, - device=torch_device, - ) - decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0) - decoder_input_ids = decoder_input_ids.to(torch_device) - - # TODO: never an `attention_mask` arg here? - processed_inputs = { - model.main_input_name: dummy_input, - "decoder_input_ids": decoder_input_ids, - "decoder_attention_mask": dummy_attention_mask, - "output_hidden_states": True, - } - else: - processed_inputs = { - model.main_input_name: dummy_input, - "output_hidden_states": True, - } - - # Otherwise fails for e.g. WhisperEncoderModel - if "attention_mask" in inspect.signature(model_eager.forward).parameters: - processed_inputs["attention_mask"] = dummy_attention_mask - - if ( - self.has_attentions - and "output_attentions" in inspect.signature(model_sdpa.forward).parameters - ): - processed_inputs["output_attentions"] = output_attentions - if not deactivate_mask and ( - "bool_masked_pos" in inspect.signature(model_eager.forward).parameters - ): - dummy_mask = torch.ones((self.model_tester.num_masks,)) - - # In case of additional token (like class) we define a custom `mask_length` - if hasattr(self.model_tester, "mask_length"): - mask_length = self.model_tester.mask_length - dummy_mask.size(0) - else: - mask_length = self.model_tester.seq_length - dummy_mask.size(0) - dummy_mask = torch.cat([dummy_mask, torch.zeros(mask_length)]) - dummy_bool_masked_pos = dummy_mask.expand(batch_size, -1).bool() - processed_inputs["bool_masked_pos"] = dummy_bool_masked_pos.to(torch_device) - - if "noise" in 
inspect.signature(model_eager.forward).parameters: - np.random.seed(2) - num_patches = int( - (self.model_tester.image_size // self.model_tester.patch_size) ** 2 - ) - noise = np.random.uniform(size=(batch_size, num_patches)) - processed_inputs["noise"] = torch.from_numpy(noise) - - # TODO: test gradients as well (& for FA2 as well!) - with torch.no_grad(): - with torch.backends.cuda.sdp_kernel( - enable_flash=enable_kernels, - enable_math=True, - enable_mem_efficient=enable_kernels, - ): - prepared_inputs = self._prepare_for_class(processed_inputs, model_class) - outputs_eager = model_eager(**prepared_inputs) - outputs_sdpa = model_sdpa(**prepared_inputs) - - logits_eager = ( - outputs_eager.hidden_states[-1] - if not is_encoder_decoder - else outputs_eager.decoder_hidden_states[-1] - ) - logits_sdpa = ( - outputs_sdpa.hidden_states[-1] - if not is_encoder_decoder - else outputs_sdpa.decoder_hidden_states[-1] - ) - - if torch_device in ["cpu", "cuda"]: - atol = atols[torch_device, enable_kernels, torch_dtype] - rtol = rtols[torch_device, enable_kernels, torch_dtype] - else: - atol = 1e-7 - rtol = 1e-4 - - # Masked tokens output slightly deviates - we don't mind that. 
- if use_mask: - if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) - elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) - - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) - - self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) - @slow @require_torch_accelerator diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 97b59f5aa50621..60eb964927278a 100644 --- 
a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -25,7 +25,6 @@ require_read_token, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -445,15 +444,6 @@ def test_use_flash_attention_2_true(self): if not has_flash: raise ValueError("The flash model should have flash attention layers") - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - """ - skipping the test since mup is very flaky and gets consistently different outputs - """ - self.skipTest("skipping the test since mup is very flaky and gets consistently different outputs") - @require_torch_gpu class GraniteIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index f2f76b9fa75bf3..97af65667ed048 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -25,7 +25,6 @@ require_read_token, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -444,15 +443,6 @@ def test_use_flash_attention_2_true(self): if not has_flash: raise ValueError("The flash model should have flash attention layers") - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - @require_torch_sdpa - @slow - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - """ - skipping the test since mup is very flaky and gets consistently different outputs - """ - self.skipTest("skipping the test since mup is very flaky and gets consistently different outputs") - @require_torch_gpu class GraniteMoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/groupvit/test_modeling_groupvit.py b/tests/models/groupvit/test_modeling_groupvit.py index ce31bc44a611d2..88b55ec56d8233 100644 --- a/tests/models/groupvit/test_modeling_groupvit.py +++ 
b/tests/models/groupvit/test_modeling_groupvit.py @@ -559,11 +559,18 @@ class GroupViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase def setUp(self): self.model_tester = GroupViTModelTester(self) + common_properties = ["projection_dim", "projection_intermediate_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=GroupViTConfig, has_text_modality=False, common_properties=common_properties + ) def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="hidden_states are tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 7be87fd78390ab..12004cc3c8ad89 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -134,7 +134,7 @@ def __init__( num_attention_heads=self.vision_num_attention_heads, num_hidden_layers=self.vision_num_hidden_layers, intermediate_size=self.vision_intermediate_size, - ) + ).to_dict() self.perceiver_qk_layer_norms_perceiver = perceiver_qk_layer_norms_perceiver self.perceiver_resampler_depth = perceiver_resampler_depth @@ -316,7 +316,6 @@ def prepare_pixel_values(self): return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) @require_torch_sdpa - @slow @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) def test_eager_matches_sdpa_inference(self, torch_dtype: str): self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test") @@ -353,6 +352,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + 
@unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") + def test_eager_matches_sdpa_inference(self): + pass + def test_model_outputs_equivalence(self): try: orig = self.all_model_classes @@ -602,6 +607,12 @@ def setUp(self): ) self.config_tester = ConfigTester(self, config_class=IdeficsConfig, hidden_size=37) + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @unittest.skip("Idefics requires both text and image inputs which is currently not done in this test.") + def test_eager_matches_sdpa_inference(self, torch_dtype): + pass + @pytest.mark.generate def test_left_padding_compatibility(self): """Overwrite because IDEFICS needs image attention mask to be also padded""" diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py index 3dcd0bf5fbcdeb..ae8c91f29d4d46 100644 --- a/tests/models/idefics2/test_modeling_idefics2.py +++ b/tests/models/idefics2/test_modeling_idefics2.py @@ -185,7 +185,12 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Idefics2VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics2Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/idefics3/test_modeling_idefics3.py b/tests/models/idefics3/test_modeling_idefics3.py index 598f5882470e99..5bfd4c3f3c0e83 100644 --- a/tests/models/idefics3/test_modeling_idefics3.py +++ b/tests/models/idefics3/test_modeling_idefics3.py @@ -168,7 +168,12 @@ class Idefics3ModelTest(ModelTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = 
Idefics3VisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=Idefics3Config, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=Idefics3Config, has_text_modality=False, common_properties=["image_token_id"] + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip(reason="input_embeds cannot be passed in without input_ids") def test_inputs_embeds(): diff --git a/tests/models/instructblip/test_modeling_instructblip.py b/tests/models/instructblip/test_modeling_instructblip.py index f06caeb03778ee..e77577dad7877b 100644 --- a/tests/models/instructblip/test_modeling_instructblip.py +++ b/tests/models/instructblip/test_modeling_instructblip.py @@ -486,6 +486,15 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene def setUp(self): self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self) + self.config_tester = ConfigTester( + self, + config_class=InstructBlipConfig, + has_text_modality=False, + common_properties=["num_query_tokens", "image_token_index"], + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py index 7e0bf4eaf0a20d..3be5f89325cf38 100644 --- a/tests/models/instructblipvideo/test_modeling_instructblipvideo.py +++ b/tests/models/instructblipvideo/test_modeling_instructblipvideo.py @@ -510,11 +510,18 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest( def setUp(self): self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self) + common_properties = ["num_query_tokens", "video_token_index"] + self.config_tester = ConfigTester( + self, config_class=InstructBlipVideoConfig, has_text_modality=False, 
common_properties=common_properties + ) def test_for_conditional_generation(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_conditional_generation(*config_and_inputs) + def test_config(self): + self.config_tester.run_common_tests() + @unittest.skip(reason="Hidden_states is tested in individual model tests") def test_hidden_states_output(self): pass diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py index 43266a750b8d6c..7ede47a348d55b 100644 --- a/tests/models/kosmos2/test_modeling_kosmos2.py +++ b/tests/models/kosmos2/test_modeling_kosmos2.py @@ -304,7 +304,12 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): def setUp(self): self.model_tester = Kosmos2ModelTester(self) - self.config_tester = ConfigTester(self, config_class=Kosmos2Config, hidden_size=37) + self.config_tester = ConfigTester( + self, config_class=Kosmos2Config, has_text_modality=False, common_properties=["latent_query_num"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite from common to skip `image_to_text_projection.latent_query` def test_initialization(self): diff --git a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py index 9e39cd0279d608..7dcf5399703103 100644 --- a/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py +++ b/tests/models/layoutlmv2/test_tokenization_layoutlmv2.py @@ -2497,3 +2497,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py index 4a218d3f211146..9af0861536f73d 100644 --- 
a/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py +++ b/tests/models/layoutlmv3/test_tokenization_layoutlmv3.py @@ -2450,3 +2450,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/layoutxlm/test_tokenization_layoutxlm.py b/tests/models/layoutxlm/test_tokenization_layoutxlm.py index 9f6d65ffc5f0a1..f387e52790fce3 100644 --- a/tests/models/layoutxlm/test_tokenization_layoutxlm.py +++ b/tests/models/layoutxlm/test_tokenization_layoutxlm.py @@ -1991,3 +1991,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index 9810ff7c2a56d4..1359e16a3d7b03 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -194,7 +194,13 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM def setUp(self): self.model_tester = LlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py 
index 2146c94c18a4b4..82508f57e0f1f5 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -90,7 +90,7 @@ def __init__( }, is_training=True, vision_config={ - "image_size": 16, + "image_size": 8, "patch_size": 4, "num_channels": 3, "is_training": True, @@ -123,10 +123,10 @@ def __init__( self.batch_size = 3 self.num_channels = 3 self.image_size = 30 - self.encoder_seq_length = 95 - self.image_grid_pinpoints = [[32, 32]] - self.num_image_tokens = 88 + self.image_grid_pinpoints = [[16, 16]] + self.num_image_tokens = 24 self.seq_length = seq_length + self.num_image_tokens + self.encoder_seq_length = self.seq_length def get_config(self): return LlavaNextConfig( @@ -223,7 +223,13 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes def setUp(self): self.model_tester = LlavaNextVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_next_video/test_modeling_llava_next_video.py b/tests/models/llava_next_video/test_modeling_llava_next_video.py index 89cdce65ece95d..83caabe16bb68b 100644 --- a/tests/models/llava_next_video/test_modeling_llava_next_video.py +++ b/tests/models/llava_next_video/test_modeling_llava_next_video.py @@ -91,7 +91,7 @@ def __init__( }, is_training=True, vision_config={ - "image_size": 16, + "image_size": 8, "patch_size": 4, "num_channels": 3, "is_training": True, @@ -125,10 +125,10 @@ def __init__( self.batch_size = 3 self.num_channels = 3 
self.image_size = 30 - self.encoder_seq_length = 127 - self.image_grid_pinpoints = [[32, 32]] - self.num_image_tokens = 88 - self.num_video_tokens = 32 + + self.image_grid_pinpoints = [[16, 16]] + self.num_image_tokens = 24 + self.num_video_tokens = 8 self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens def get_config(self): @@ -240,7 +240,13 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaNextVideoVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaNextVideoConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=LlavaNextVideoConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/llava_onevision/test_modeling_llava_onevision.py b/tests/models/llava_onevision/test_modeling_llava_onevision.py index 7a5781fa039b5b..a217eee2c70671 100644 --- a/tests/models/llava_onevision/test_modeling_llava_onevision.py +++ b/tests/models/llava_onevision/test_modeling_llava_onevision.py @@ -226,7 +226,13 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati def setUp(self): self.model_tester = LlavaOnevisionVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=LlavaOnevisionConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer"] + self.config_tester = ConfigTester( + self, config_class=LlavaOnevisionConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def 
test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/markuplm/test_tokenization_markuplm.py b/tests/models/markuplm/test_tokenization_markuplm.py index 60c98776b2a407..eaf30131d34054 100644 --- a/tests/models/markuplm/test_tokenization_markuplm.py +++ b/tests/models/markuplm/test_tokenization_markuplm.py @@ -2330,3 +2330,7 @@ def test_added_tokens_serialization(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/mimi/test_modeling_mimi.py b/tests/models/mimi/test_modeling_mimi.py index df0007d666a077..7ddc6b747447a3 100644 --- a/tests/models/mimi/test_modeling_mimi.py +++ b/tests/models/mimi/test_modeling_mimi.py @@ -409,10 +409,14 @@ def test_identity_shortcut(self): config.use_conv_shortcut = False self.model_tester.create_and_check_model_forward(config, inputs_dict) + # Overwrite to use `audio_values` as the tensors to compare. + # TODO: Try to do this in the parent class. 
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if torch_dtype == "float16" and torch_device == "cpu": + self.skipTest("`replication_pad1d` not implemented for 'Half") + if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -513,7 +517,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): can_output_attn = "output_attentions" in inspect.signature(model_sdpa.forward).parameters if not (self.has_attentions and can_output_attn) and output_attentions: continue - for batch_size in [1, 5]: + for batch_size in [7]: dummy_input = inputs_dict[model.main_input_name] if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: @@ -564,11 +568,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -655,52 +659,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) - elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, 
atol, rtol) - ) + elif padding_side == "right": + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] + + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) diff --git a/tests/models/mllama/test_modeling_mllama.py b/tests/models/mllama/test_modeling_mllama.py index 91f2169a02f42d..8da927f815db81 100644 --- a/tests/models/mllama/test_modeling_mllama.py +++ b/tests/models/mllama/test_modeling_mllama.py @@ -30,12 +30,10 @@ from transformers.models.mllama.configuration_mllama import MllamaTextConfig from transformers.testing_utils import ( cleanup, - is_flaky, require_bitsandbytes, require_read_token, require_torch, require_torch_gpu, - require_torch_sdpa, slow, torch_device, ) @@ -272,7 +270,12 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester def setUp(self): self.model_tester = MllamaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=MllamaConfig, has_text_modality=False) + self.config_tester = ConfigTester( + self, config_class=MllamaConfig, has_text_modality=False, common_properties=["image_token_index"] + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): @@ -354,13 +357,6 @@ def _check_attentions_for_generate( self.assertListEqual([layer_attention.shape for layer_attention in iter_attentions], expected_shapes) - @require_torch_sdpa - @slow - @is_flaky() - def test_eager_matches_sdpa_inference_1_bfloat16(self): - # A 
workaround to override parametrized test with flaky decorator - super().test_eager_matches_sdpa_inference_1_bfloat16() - @unittest.skip("For some unknown reasons the tests fails in CrossAttention layer when doing torch.sdpa(). ") def test_sdpa_can_compile_dynamic(self): pass diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index 963cace28d6e41..37b5af3ae7e312 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -452,7 +452,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: @@ -479,8 +478,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -491,8 +492,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -528,7 +531,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: # Ignore copy batch_size_input_ids = self.model_tester.num_codebooks * batch_size 
dummy_input = inputs_dict[model.main_input_name] @@ -585,11 +588,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -632,52 +635,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, 
atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) @@ -1496,8 +1479,6 @@ def test_sdpa_can_dispatch_composite_models(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow - # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -1523,8 +1504,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, 
("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -1535,8 +1518,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -1549,8 +1534,26 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): def get_mean_reldiff(failcase, x, ref, atol, rtol): return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + if hasattr(self.model_tester, "num_hidden_layers"): + self.model_tester.num_hidden_layers = 1 + for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.rms_norm_eps = 1.0 + config.layer_norm_eps = 1.0 + config.norm_eps = 1.0 + config.norm_epsilon = 1.0 + config.layer_norm_epsilon = 1.0 + + for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]: + if hasattr(config, attr): + getattr(config, attr).rms_norm_eps = 1.0 + getattr(config, attr).layer_norm_eps = 1.0 + getattr(config, attr).norm_eps = 1.0 + getattr(config, attr).norm_epsilon = 1.0 + getattr(config, attr).layer_norm_epsilon = 1.0 + model = model_class(config) is_encoder_decoder = model.config.is_encoder_decoder @@ -1567,12 +1570,19 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): ) model_eager = model_eager.eval().to(torch_device) + for x in model_eager.modules(): + if isinstance(x, (torch.nn.LayerNorm, torch.nn.GroupNorm)): + x.eps = 1.0 + for x in model_sdpa.modules(): + if isinstance(x, (torch.nn.LayerNorm, torch.nn.GroupNorm)): + x.eps 
= 1.0 + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, # but it would be nicer to have an efficient way to use parameterized.expand fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: dummy_input = inputs_dict[model.main_input_name] if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: @@ -1622,11 +1632,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -1687,52 +1697,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, 
atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 957db9f23b0f21..de7a2745ca073f 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -460,7 +460,6 @@ def test_flash_attn_2_inference_equivalence_right_padding(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: @@ -487,8 +486,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -499,8 +500,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, 
torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -536,7 +539,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: # Ignore copy batch_size_input_ids = self.model_tester.num_codebooks * batch_size dummy_input = inputs_dict[model.main_input_name] @@ -593,11 +596,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -640,52 +643,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. 
if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, 
atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) @@ -1486,7 +1469,6 @@ def test_sdpa_can_dispatch_composite_models(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.all_model_classes[0]._supports_sdpa: @@ -1510,8 +1492,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -1522,8 +1506,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -1559,7 +1545,7 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): fail_cases = [] for padding_side in ["left", "right"]: for use_mask in [False, True]: - for batch_size in [1, 5]: + for batch_size in [7]: dummy_input = 
inputs_dict[model.main_input_name] if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: @@ -1609,11 +1595,11 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): dummy_attention_mask[:] = 1 if padding_side == "left": - dummy_attention_mask[-1, :-1] = 1 - dummy_attention_mask[-1, -4:] = 0 + dummy_attention_mask[-1, :2] = 0 + dummy_attention_mask[-1, 2:] = 1 elif padding_side == "right": - dummy_attention_mask[-1, 1:] = 1 - dummy_attention_mask[-1, :3] = 0 + dummy_attention_mask[-1, -2:] = 0 + dummy_attention_mask[-1, :-2] = 1 for enable_kernels in [False, True]: failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" @@ -1674,52 +1660,32 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): # Masked tokens output slightly deviates - we don't mind that. if use_mask: + _logits_sdpa = torch.zeros_like(input=logits_sdpa) + _logits_eager = torch.zeros_like(input=logits_eager) + + _logits_sdpa[:-1] = logits_sdpa[:-1] + _logits_eager[:-1] = logits_eager[:-1] + if padding_side == "left": - sub_sdpa = logits_sdpa[:-1] - sub_eager = logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, :-4] - sub_eager = logits_eager[-1, :-4] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, -4:] - # sub_eager = logits_eager[-1, -4:] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, 2:] + _logits_eager[-1:, 2:] = logits_eager[-1:, 2:] + elif padding_side == "right": - sub_sdpa = logits_sdpa[:-1] - sub_eager = 
logits_eager[:-1] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - sub_sdpa = logits_sdpa[-1, 3:] - sub_eager = logits_eager[-1, 3:] - if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) - ) - - # Testing the padding tokens is not really meaningful but anyway - # sub_sdpa = logits_sdpa[-1, :3] - # sub_eager = logits_eager[-1, :3] - # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): - # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + _logits_sdpa[-1:, 2:] = logits_sdpa[-1:, :-2] + _logits_eager[-1:, 2:] = logits_eager[-1:, :-2] - else: - if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): - fail_cases.append( - get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) - ) + logits_sdpa = _logits_sdpa + logits_eager = _logits_eager + + results = [ + torch.allclose(_logits_sdpa, _logits_eager, atol=atol, rtol=rtol) + for (_logits_sdpa, _logits_eager) in zip(logits_sdpa, logits_eager) + ] + # If 80% batch elements have matched results, it's fine + if np.mean(results) < 0.8: + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) diff --git a/tests/models/nemotron/test_modeling_nemotron.py b/tests/models/nemotron/test_modeling_nemotron.py index 37a581a33866ce..fd62c74d3d6e11 100644 --- a/tests/models/nemotron/test_modeling_nemotron.py +++ b/tests/models/nemotron/test_modeling_nemotron.py @@ -19,7 +19,6 @@ import unittest import pytest -from parameterized import parameterized from transformers import NemotronConfig, is_torch_available from transformers.testing_utils import ( @@ -99,15 +98,6 @@ def setUp(self): self.model_tester = NemotronModelTester(self) self.config_tester = ConfigTester(self, 
config_class=NemotronConfig, hidden_size=37) - @require_torch_sdpa - @slow - @unittest.skip( - reason="Due to custom causal mask, there is a slightly too big difference between eager and sdpa in bfloat16." - ) - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - pass - @unittest.skip("Eager and SDPA do not produce the same outputs, thus this test fails") def test_model_outputs_equivalence(self, **kwargs): pass diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py index 48070c7bb86c6b..df763aed48c749 100644 --- a/tests/models/owlv2/test_modeling_owlv2.py +++ b/tests/models/owlv2/test_modeling_owlv2.py @@ -447,6 +447,13 @@ class Owlv2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = Owlv2ModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=Owlv2Config, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/owlvit/test_modeling_owlvit.py b/tests/models/owlvit/test_modeling_owlvit.py index a08fae0bc6d10e..e0599a50fb98b4 100644 --- a/tests/models/owlvit/test_modeling_owlvit.py +++ b/tests/models/owlvit/test_modeling_owlvit.py @@ -442,6 +442,13 @@ class OwlViTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = OwlViTModelTester(self) + common_properties = ["projection_dim", "logit_scale_init_value"] + self.config_tester = ConfigTester( + self, config_class=OwlViTConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = 
self.model_tester.prepare_config_and_inputs() diff --git a/tests/models/paligemma/test_modeling_paligemma.py b/tests/models/paligemma/test_modeling_paligemma.py index 074e0083fd0202..ce44436a20ad2c 100644 --- a/tests/models/paligemma/test_modeling_paligemma.py +++ b/tests/models/paligemma/test_modeling_paligemma.py @@ -17,7 +17,6 @@ import unittest import requests -from parameterized import parameterized from transformers import ( PaliGemmaConfig, @@ -30,7 +29,6 @@ cleanup, require_read_token, require_torch, - require_torch_sdpa, slow, torch_device, ) @@ -301,14 +299,6 @@ def test_disk_offload_safetensors(self): def test_model_parallelism(self): pass - @require_torch_sdpa - @slow - @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) - def test_eager_matches_sdpa_inference(self, torch_dtype: str): - self.skipTest( - "Due to custom causal mask, there is a slightly too big difference between eager and sdpa in bfloat16." - ) - @unittest.skip( reason="PaliGemmma's SigLip encoder uses the same initialization scheme as the Flax original implementation" ) diff --git a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py index 6c04ba40df19d6..c3902c9e75bc66 100644 --- a/tests/models/qwen2_vl/test_modeling_qwen2_vl.py +++ b/tests/models/qwen2_vl/test_modeling_qwen2_vl.py @@ -66,12 +66,12 @@ def __init__( bos_token_id=0, eos_token_id=1, pad_token_id=2, - vision_start_token_id=151652, - image_token_id=151655, - video_token_id=151656, + vision_start_token_id=3, + image_token_id=4, + video_token_id=5, hidden_act="silu", hidden_size=32, - vocab_size=152064, + vocab_size=99, intermediate_size=37, max_position_embeddings=512, max_window_layers=3, @@ -166,6 +166,8 @@ def prepare_config_and_inputs_for_common(self): input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device) + input_ids[:, -1] = self.pad_token_id + 
input_ids[input_ids == self.video_token_id] = self.pad_token_id input_ids[input_ids == self.image_token_id] = self.pad_token_id input_ids[:, self.num_image_tokens] = self.image_token_id labels = torch.zeros( @@ -232,6 +234,9 @@ def setUp(self): self.model_tester = Qwen2VLVisionText2TextModelTester(self) self.config_tester = ConfigTester(self, config_class=Qwen2VLConfig, has_text_modality=False) + def test_config(self): + self.config_tester.run_common_tests() + def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/models/siglip/test_modeling_siglip.py b/tests/models/siglip/test_modeling_siglip.py index 2fe06b1511a471..61ac78f102994a 100644 --- a/tests/models/siglip/test_modeling_siglip.py +++ b/tests/models/siglip/test_modeling_siglip.py @@ -667,9 +667,12 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test test_disk_offload_bin = False _is_composite = True - # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip def setUp(self): self.model_tester = SiglipModelTester(self) + self.config_tester = ConfigTester(self, config_class=SiglipConfig, has_text_modality=False) + + def test_config(self): + self.config_tester.run_common_tests() # Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.test_model def test_model(self): diff --git a/tests/models/tapas/test_tokenization_tapas.py b/tests/models/tapas/test_tokenization_tapas.py index 49327a39cd80d3..0a911f7182b4a0 100644 --- a/tests/models/tapas/test_tokenization_tapas.py +++ b/tests/models/tapas/test_tokenization_tapas.py @@ -1290,3 +1290,7 @@ def test_chat_template(self): @unittest.skip("Chat is not supported") def test_chat_template_return_assistant_tokens_mask(self): pass + + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass diff --git a/tests/models/udop/test_tokenization_udop.py 
b/tests/models/udop/test_tokenization_udop.py index 90d669064a0fd1..a6ac2ff3d38096 100644 --- a/tests/models/udop/test_tokenization_udop.py +++ b/tests/models/udop/test_tokenization_udop.py @@ -1161,6 +1161,10 @@ def test_chat_template(self): def test_chat_template_return_assistant_tokens_mask(self): pass + @unittest.skip("Chat is not supported") + def test_chat_template_return_assistant_tokens_mask_truncated(self): + pass + @unittest.skip(reason="Chat template tests don't play well with table/layout models.") def test_chat_template_batched(self): pass diff --git a/tests/models/video_llava/test_modeling_video_llava.py b/tests/models/video_llava/test_modeling_video_llava.py index 0044ef02720c2b..090907b164e80d 100644 --- a/tests/models/video_llava/test_modeling_video_llava.py +++ b/tests/models/video_llava/test_modeling_video_llava.py @@ -57,8 +57,8 @@ def __init__( image_token_index=0, video_token_index=1, projector_hidden_act="gelu", - seq_length=13, - num_frames=8, + seq_length=3, + num_frames=2, vision_feature_select_strategy="default", vision_feature_layer=-1, text_config={ @@ -88,7 +88,7 @@ def __init__( vision_config={ "model_type": "clip_vision_model", "batch_size": 12, - "image_size": 30, + "image_size": 8, "patch_size": 6, "num_channels": 3, "is_training": True, @@ -123,10 +123,11 @@ def __init__( self.batch_size = 5 self.num_channels = 3 self.image_size = 224 - self.encoder_seq_length = 246 - self.num_image_tokens = 25 - self.num_video_tokens = 26 * self.num_frames + + self.num_image_tokens = (vision_config["image_size"] // vision_config["patch_size"]) ** 2 + self.num_video_tokens = (self.num_image_tokens + 1) * self.num_frames self.seq_length = seq_length + self.num_image_tokens + self.num_video_tokens + self.encoder_seq_length = self.seq_length def get_config(self): return VideoLlavaConfig( @@ -217,7 +218,13 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe def setUp(self): self.model_tester = 
VideoLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VideoLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "video_token_index", "vision_feature_layer", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VideoLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() @unittest.skip( reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124" diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py index 801990331fea53..212eae1471222f 100644 --- a/tests/models/videomae/test_modeling_videomae.py +++ b/tests/models/videomae/test_modeling_videomae.py @@ -22,7 +22,7 @@ from transformers import VideoMAEConfig from transformers.models.auto import get_values -from transformers.testing_utils import require_torch, require_vision, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_sdpa, require_vision, slow, torch_device from transformers.utils import cached_property, is_torch_available, is_vision_available from ...test_configuration_common import ConfigTester @@ -213,6 +213,11 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): return inputs_dict + @unittest.skip("`mse_cpu` not implemented for 'BFloat16'") + @require_torch_sdpa + def test_eager_matches_sdpa_inference_1_bfloat16(self): + pass + def test_config(self): self.config_tester.run_common_tests() diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py index e2f9ae1ccfdea7..25670d782a987e 100644 --- a/tests/models/vipllava/test_modeling_vipllava.py +++ b/tests/models/vipllava/test_modeling_vipllava.py @@ -179,7 +179,13 @@ class 
VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest def setUp(self): self.model_tester = VipLlavaVisionText2TextModelTester(self) - self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False) + common_properties = ["image_token_index", "vision_feature_layers", "image_seq_length"] + self.config_tester = ConfigTester( + self, config_class=VipLlavaConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs def test_inputs_embeds(self): diff --git a/tests/models/x_clip/test_modeling_x_clip.py b/tests/models/x_clip/test_modeling_x_clip.py index 8b91019bae18cc..04dd2d9d29687a 100644 --- a/tests/models/x_clip/test_modeling_x_clip.py +++ b/tests/models/x_clip/test_modeling_x_clip.py @@ -547,6 +547,13 @@ class XCLIPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): def setUp(self): self.model_tester = XCLIPModelTester(self) + common_properties = ["projection_dim", "prompt_layers", "prompt_num_attention_heads"] + self.config_tester = ConfigTester( + self, config_class=XCLIPConfig, has_text_modality=False, common_properties=common_properties + ) + + def test_config(self): + self.config_tester.run_common_tests() def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() diff --git a/tests/quantization/ggml/test_ggml.py b/tests/quantization/ggml/test_ggml.py index da1af9bff8df90..84278e7032537b 100644 --- a/tests/quantization/ggml/test_ggml.py +++ b/tests/quantization/ggml/test_ggml.py @@ -673,10 +673,6 @@ def test_stablelm_fp16(self): self.stablelm2_model_id, gguf_file=self.fp16_stablelm2_model_id, torch_dtype=torch.float16, - # for precise comparison it is required to use the original model config - # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias - # and it highly influences 
on the output results - config=original_model.config, ) tokenizer = AutoTokenizer.from_pretrained(self.stablelm2_model_id, gguf_file=self.fp16_stablelm2_model_id) @@ -703,10 +699,6 @@ def test_stablelm_weights_conversion_fp16(self): gguf_file=self.fp16_stablelm2_model_id, device_map="auto", torch_dtype=torch.float16, - # for precise comparison it is required to use the original model config - # as quantized one is different in parameters: use_parallel_residual and use_qkv_bias - # and it highly influences on the output results - config=original_model.config, ) converted_state_dict = converted_model.state_dict() diff --git a/tests/quantization/torchao_integration/test_torchao.py b/tests/quantization/torchao_integration/test_torchao.py index c7c701e49aec14..c3ab06ee61ba59 100644 --- a/tests/quantization/torchao_integration/test_torchao.py +++ b/tests/quantization/torchao_integration/test_torchao.py @@ -74,6 +74,13 @@ def test_post_init_check(self): with self.assertRaisesRegex(ValueError, "Unexpected keyword arg"): _ = TorchAoConfig("int4_weight_only", group_size1=32) + def test_repr(self): + """ + Check that there is no error in the repr + """ + quantization_config = TorchAoConfig("int4_weight_only", modules_to_not_convert=["conv"], group_size=8) + repr(quantization_config) + @require_torch_gpu @require_torchao diff --git a/tests/test_configuration_common.py b/tests/test_configuration_common.py index 81c6a008b133ca..4dbbdedbbc2eb7 100644 --- a/tests/test_configuration_common.py +++ b/tests/test_configuration_common.py @@ -17,12 +17,17 @@ import json import os import tempfile +from pathlib import Path from transformers import is_torch_available +from transformers.utils import direct_transformers_import from .utils.test_configuration_utils import config_common_kwargs +transformers_module = direct_transformers_import(Path(__file__).parent) + + class ConfigTester: def __init__(self, parent, config_class=None, has_text_modality=True, common_properties=None, **kwargs): 
self.parent = parent @@ -35,9 +40,10 @@ def create_and_test_config_common_properties(self): config = self.config_class(**self.inputs_dict) common_properties = ( ["hidden_size", "num_attention_heads", "num_hidden_layers"] - if self.common_properties is None + if self.common_properties is None and not self.config_class.sub_configs else self.common_properties ) + common_properties = [] if common_properties is None else common_properties # Add common fields for text models if self.has_text_modality: @@ -110,6 +116,44 @@ def create_and_test_config_from_and_save_pretrained_subfolder(self): self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + def create_and_test_config_from_and_save_pretrained_composite(self): + """ + Tests that composite or nested cofigs can be loaded and saved correctly. In case the config + has a sub-config, we should be able to call `sub_config.from_pretrained('general_config_file')` + and get a result same as if we loaded the whole config and obtained `config.sub_config` from it. 
+ """ + config = self.config_class(**self.inputs_dict) + + with tempfile.TemporaryDirectory() as tmpdirname: + config.save_pretrained(tmpdirname) + general_config_loaded = self.config_class.from_pretrained(tmpdirname) + general_config_dict = config.to_dict() + + # Iterate over all sub_configs if there are any and load them with their own classes + sub_configs = self.config_class.sub_configs + for sub_config_key, sub_class in sub_configs.items(): + if sub_class.__name__ == "AutoConfig": + sub_class = sub_class.for_model(**general_config_dict[sub_config_key]).__class__ + sub_config_loaded = sub_class.from_pretrained(tmpdirname) + else: + sub_config_loaded = sub_class.from_pretrained(tmpdirname) + + # Pop `transformers_version`, it never exists when a config is part of a general composite config + # Verify that loading with subconfig class results in same dict as if we loaded with general composite config class + sub_config_loaded_dict = sub_config_loaded.to_dict() + sub_config_loaded_dict.pop("transformers_version", None) + self.parent.assertEqual(sub_config_loaded_dict, general_config_dict[sub_config_key]) + + # Verify that the loaded config type is same as in the general config + type_from_general_config = type(getattr(general_config_loaded, sub_config_key)) + self.parent.assertTrue(isinstance(sub_config_loaded, type_from_general_config)) + + # Now save only the sub-config and load it back to make sure the whole load-save-load pipeline works + with tempfile.TemporaryDirectory() as tmpdirname2: + sub_config_loaded.save_pretrained(tmpdirname2) + sub_config_loaded_2 = sub_class.from_pretrained(tmpdirname2) + self.parent.assertEqual(sub_config_loaded.to_dict(), sub_config_loaded_2.to_dict()) + def create_and_test_config_with_num_labels(self): config = self.config_class(**self.inputs_dict, num_labels=5) self.parent.assertEqual(len(config.id2label), 5) @@ -128,6 +172,9 @@ def check_config_can_be_init_without_params(self): self.parent.assertIsNotNone(config) def 
check_config_arguments_init(self): + if self.config_class.sub_configs: + return # TODO: @raushan composite models are not consistent in how they set general params + kwargs = copy.deepcopy(config_common_kwargs) config = self.config_class(**kwargs) wrong_values = [] @@ -153,6 +200,7 @@ def run_common_tests(self): self.create_and_test_config_to_json_file() self.create_and_test_config_from_and_save_pretrained() self.create_and_test_config_from_and_save_pretrained_subfolder() + self.create_and_test_config_from_and_save_pretrained_composite() self.create_and_test_config_with_num_labels() self.check_config_can_be_init_without_params() self.check_config_arguments_init() diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 13c4d5155be445..94b5e175bf88a2 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3802,22 +3802,18 @@ def test_attn_implementation_composite_models(self): self.skipTest("Model is not a composite model.") config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - sub_configs = { - key: getattr(config, key) for key in config if isinstance(getattr(config, key), PretrainedConfig) - } # set eager as it will be the one supported in all models # we just need to test if passing 'attn_implementation' as a dict fails or not attn_implementation_per_subconfig = {} - for key, sub_config in sub_configs.items(): + for key in config.sub_configs.keys(): attn_implementation_per_subconfig[key] = "eager" config._attn_implementation = attn_implementation_per_subconfig model = model_class(config) - for key in model.config: - if isinstance(getattr(model.config, key), PretrainedConfig): - sub_config = getattr(model.config, key) - self.assertTrue(sub_config._attn_implementation == "eager") + for key in config.sub_configs.keys(): + sub_config = getattr(model.config, key) + self.assertTrue(sub_config._attn_implementation == "eager") for name, submodule in model.named_modules(): class_name = 
submodule.__class__.__name__ @@ -3826,7 +3822,7 @@ def test_attn_implementation_composite_models(self): or "SdpaSelfAttention" in class_name or "FlashAttention" in class_name ): - raise ValueError("The eager model should not have SDPA/FA2 attention layers") + raise ValueError(f"The eager model should not have SDPA/FA2 attention layers but got {class_name}") @require_torch_sdpa def test_sdpa_can_dispatch_non_composite_models(self): @@ -3932,7 +3928,6 @@ def test_sdpa_can_dispatch_composite_models(self): @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) @require_torch_sdpa - @slow def test_eager_matches_sdpa_inference(self, torch_dtype: str): if not self.has_attentions: self.skipTest(reason="Model architecture does not support attentions") @@ -3958,8 +3953,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): atols = { ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-6, ("cuda", False, torch.bfloat16): 1e-2, @@ -3970,8 +3967,10 @@ def test_eager_matches_sdpa_inference(self, torch_dtype: str): } rtols = { ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.float16): 5e-3, ("cpu", False, torch.bfloat16): 1e-2, ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.float16): 5e-3, ("cpu", True, torch.bfloat16): 1e-2, ("cuda", False, torch.float32): 1e-4, ("cuda", False, torch.bfloat16): 1e-2, @@ -3987,12 +3986,31 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): if hasattr(self.model_tester, "num_hidden_layers"): self.model_tester.num_hidden_layers = 1 if hasattr(self.model_tester, "vision_config") and "num_hidden_layers" in self.model_tester.vision_config: + self.model_tester.vision_config = copy.deepcopy(self.model_tester.vision_config) self.model_tester.vision_config["num_hidden_layers"] = 1 if 
hasattr(self.model_tester, "text_config") and "num_hidden_layers" in self.model_tester.text_config: + self.model_tester.text_config = copy.deepcopy(self.model_tester.text_config) self.model_tester.text_config["num_hidden_layers"] = 1 for model_class in self.all_model_classes: config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + config.rms_norm_eps = 1.0 + config.layer_norm_eps = 1.0 + config.norm_eps = 1.0 + config.norm_epsilon = 1.0 + config.layer_norm_epsilon = 1.0 + + # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance. + # (We don't need the original epsilon values to check eager/sdpa matches) + for attr in ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"]: + if hasattr(config, attr): + getattr(config, attr).rms_norm_eps = 1.0 + getattr(config, attr).layer_norm_eps = 1.0 + getattr(config, attr).norm_eps = 1.0 + getattr(config, attr).norm_epsilon = 1.0 + getattr(config, attr).layer_norm_epsilon = 1.0 + model = model_class(config) # FIXME: we deactivate boolean mask for models using "use_mask_token" in their constructors. # These models support masking only in the case `use_mask_token=True`. Otherwise they cannot consume an input mask. @@ -4004,14 +4022,22 @@ def get_mean_reldiff(failcase, x, ref, atol, rtol): with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) - model_sdpa = model_sdpa.eval().to(torch_device) + model_sdpa = model_sdpa.eval().to(torch_device, dtype=torch_dtype) model_eager = model_class.from_pretrained( tmpdirname, torch_dtype=torch_dtype, attn_implementation="eager", ) - model_eager = model_eager.eval().to(torch_device) + model_eager = model_eager.eval().to(torch_device, dtype=torch_dtype) + + # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.) 
+ for x in model_eager.modules(): + if isinstance(x, (nn.LayerNorm, nn.GroupNorm)): + x.eps = 1.0 + for x in model_sdpa.modules(): + if isinstance(x, (nn.LayerNorm, nn.GroupNorm)): + x.eps = 1.0 # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 16 times the model, # but it would be nicer to have an efficient way to use parameterized.expand diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index dd9eb10de40ecf..a3bbbf3c9e97b2 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1327,6 +1327,110 @@ def test_chat_template_return_assistant_tokens_mask(self): [0] * (assistant_start2 - assistant_end - 1), ) + @require_jinja + def test_chat_template_return_assistant_tokens_mask_truncated(self): + dummy_template = ( + "{% for message in messages %}" + "{% if (message['role'] != 'assistant') %}" + "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}" + "{% elif (message['role'] == 'assistant')%}" + "{{'<|im_start|>' + message['role'] + '\n'}}" + "{% generation %}" + "{{message['content'] + '<|im_end|>'}}" + "{% endgeneration %}" + "{{'\n'}}" + "{% endif %}" + "{% endfor %}" + ) + conversations = [ + [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + { + "role": "assistant", + "content": ( + "start turn assistant. long string to be truncated, long string to be truncated, " + "long string to be truncated, long string to be truncated, long string to be truncated" + ), + }, + {"role": "user", "content": "another user message"}, + ], + [ + {"role": "system", "content": "system message"}, + {"role": "user", "content": "user message"}, + { + "role": "assistant", + "content": ( + "start turn assistant. 
long string to be truncated, long string to be truncated, " + "long string to be truncated, long string to be truncated, long string to be truncated" + ), + }, + {"role": "user", "content": "another user message"}, + ], + ] + + for tokenizer, pretrained_name, _ in self.tokenizers_list: + with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"): + if not self.test_rust_tokenizer: + self.skipTest(reason="No fast tokenizer defined") + + tokenizer_r = self.rust_tokenizer_class.from_pretrained(pretrained_name) + + # Find where to truncate, as the amount of tokens is different for different tokenizers and I want the + # truncation to happen in the middle of the assistant content. + full_encoding = tokenizer_r.apply_chat_template( + conversations[0], + chat_template=dummy_template, + tokenize=True, + return_dict=True, + ) + chat_string = tokenizer_r.apply_chat_template( + conversations[0], tokenize=False, chat_template=dummy_template + ) + truncation_position = full_encoding.char_to_token(chat_string.index(", long string to be truncated,")) + + # check batched + output = tokenizer_r.apply_chat_template( + conversations, + chat_template=dummy_template, + tokenize=True, + return_assistant_tokens_mask=True, + max_length=truncation_position, + truncation=True, + return_dict=True, + ) + for i, conv in enumerate(conversations): + chat_string = tokenizer_r.apply_chat_template(conv, tokenize=False, chat_template=dummy_template) + assistant_start = output.char_to_token(i, chat_string.index("start turn assistant")) + + # assert 1 from assistant_start to the end because the rest is truncated. 
+ self.assertEqual( + output["assistant_masks"][i][assistant_start:], + [1] * (len(output["assistant_masks"][i]) - assistant_start), + ) + + # check not batched + output = tokenizer_r.apply_chat_template( + conversations[0], + chat_template=dummy_template, + tokenize=True, + return_assistant_tokens_mask=True, + return_dict=True, + max_length=truncation_position, + truncation=True, + ) + + chat_string = tokenizer_r.apply_chat_template( + conversations[0], tokenize=False, chat_template=dummy_template + ) + assistant_start = output.char_to_token(0, chat_string.index("start turn assistant")) + + # assert 1 from assistant_start to the end because the rest is truncated. + self.assertEqual( + output["assistant_masks"][assistant_start:], + [1] * (len(output["assistant_masks"]) - assistant_start), + ) + @require_jinja def test_continue_final_message(self): dummy_template = """