diff --git a/optimum/bettertransformer/models/attention.py b/optimum/bettertransformer/models/attention.py
index d7ded056941..a9fdb7b3e98 100644
--- a/optimum/bettertransformer/models/attention.py
+++ b/optimum/bettertransformer/models/attention.py
@@ -913,6 +913,7 @@ def falcon_forward(
     alibi: Optional[torch.Tensor],
     attention_mask: torch.Tensor,
     layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+    position_ids: Optional[torch.LongTensor] = None,
     head_mask: Optional[torch.Tensor] = None,
     use_cache: bool = False,
     output_attentions: bool = False,
@@ -937,7 +938,7 @@ def falcon_forward(
     value_layer = value_layer.transpose(1, 2).reshape(batch_size * num_kv_heads, query_length, self.head_dim)
 
     past_kv_length = 0 if layer_past is None else layer_past[0].shape[1]
-    query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length)
+    query_layer, key_layer = self.maybe_rotary(query_layer, key_layer, past_kv_length, position_ids)
 
     if layer_past is not None:
         past_key, past_value = layer_past
diff --git a/optimum/bettertransformer/models/decoder_models.py b/optimum/bettertransformer/models/decoder_models.py
index 47aacd17b61..adcbfb4fd87 100644
--- a/optimum/bettertransformer/models/decoder_models.py
+++ b/optimum/bettertransformer/models/decoder_models.py
@@ -44,7 +44,7 @@
 else:
     from ...utils.dummy_bettertransformer_objects import BarkSelfAttention
 
-if check_if_transformers_greater("4.32"):
+if check_if_transformers_greater("4.34"):
     from transformers.models.falcon.modeling_falcon import FalconAttention
 else:
     from ...utils.dummy_bettertransformer_objects import FalconAttention
diff --git a/optimum/utils/dummy_bettertransformer_objects.py b/optimum/utils/dummy_bettertransformer_objects.py
index e0d982c4263..83a88076563 100644
--- a/optimum/utils/dummy_bettertransformer_objects.py
+++ b/optimum/utils/dummy_bettertransformer_objects.py
@@ -16,10 +16,10 @@ def __init__(self, *args, **kwargs):
 
 
 class FalconAttention(metaclass=DummyObject):
-    _backends = ["transformers_432"]
+    _backends = ["transformers_434"]
 
     def __init__(self, *args, **kwargs):
-        requires_backends(self, ["transformers_432"])
+        requires_backends(self, ["transformers_434"])
 
 
 def _llama_prepare_decoder_attention_mask(*args, **kwargs):
diff --git a/optimum/utils/import_utils.py b/optimum/utils/import_utils.py
index 773f61576e2..ed310a43ea5 100644
--- a/optimum/utils/import_utils.py
+++ b/optimum/utils/import_utils.py
@@ -201,6 +201,10 @@ def require_numpy_strictly_lower(version: str, message: str):
             "transformers_432",
             (lambda: check_if_transformers_greater("4.32"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.32")),
         ),
+        (
+            "transformers_434",
+            (lambda: check_if_transformers_greater("4.34"), "{0} " + TRANSFORMERS_IMPORT_ERROR.format("4.34")),
+        ),
     ]
 )
 
diff --git a/optimum/utils/input_generators.py b/optimum/utils/input_generators.py
index 227c12315d9..29b1a02a514 100644
--- a/optimum/utils/input_generators.py
+++ b/optimum/utils/input_generators.py
@@ -862,21 +862,44 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int
 
 
 class FalconDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator):
-    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
-        self.num_kv_heads = 1
-        head_dim = self.hidden_size // self.num_attention_heads
+    def __init__(
+        self,
+        task: str,
+        normalized_config: NormalizedTextConfig,
+        batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"],
+        sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"],
+        random_batch_size_range: Optional[Tuple[int, int]] = None,
+        random_sequence_length_range: Optional[Tuple[int, int]] = None,
+        **kwargs,
+    ):
+        super().__init__(
+            task=task,
+            normalized_config=normalized_config,
+            batch_size=batch_size,
+            sequence_length=sequence_length,
+            random_batch_size_range=random_batch_size_range,
+            random_sequence_length_range=random_sequence_length_range,
+            **kwargs,
+        )
+        self.num_kv_heads = (
+            normalized_config.num_kv_heads
+            if (normalized_config.new_decoder_architecture or not normalized_config.multi_query)
+            else 1
+        )
+        self.head_dim = self.hidden_size // self.num_attention_heads
 
+    def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"):
         past_key_shape = (
             self.batch_size,
             self.num_kv_heads,
             self.sequence_length,
-            head_dim,
+            self.head_dim,
         )
         past_value_shape = (
             self.batch_size,
             self.num_kv_heads,
             self.sequence_length,
-            head_dim,
+            self.head_dim,
         )
         return [
             (
diff --git a/optimum/utils/normalized_config.py b/optimum/utils/normalized_config.py
index 14dd17488c4..d874cc56784 100644
--- a/optimum/utils/normalized_config.py
+++ b/optimum/utils/normalized_config.py
@@ -211,7 +211,9 @@ class NormalizedConfigManager:
         "blenderbot": BartLikeNormalizedTextConfig,
         "blenderbot-small": BartLikeNormalizedTextConfig,
         "bloom": NormalizedTextConfig.with_args(num_layers="n_layer"),
-        "falcon": NormalizedTextConfig.with_args(num_layers="num_hidden_layers", num_attention_heads="num_kv_heads"),
+        "falcon": NormalizedTextConfig.with_args(
+            num_layers="num_hidden_layers", num_attention_heads="num_attention_heads"
+        ),
         "camembert": NormalizedTextConfig,
         "codegen": GPT2LikeNormalizedTextConfig,
         "cvt": NormalizedVisionConfig,
diff --git a/tests/bettertransformer/testing_utils.py b/tests/bettertransformer/testing_utils.py
index 113c59f63cb..ed453d06d7a 100644
--- a/tests/bettertransformer/testing_utils.py
+++ b/tests/bettertransformer/testing_utils.py
@@ -43,7 +43,7 @@
     "distilbert": "hf-internal-testing/tiny-random-DistilBertModel",
     "electra": "hf-internal-testing/tiny-random-ElectraModel",
     "ernie": "hf-internal-testing/tiny-random-ErnieModel",
-    "falcon": "Rocketknight1/tiny-random-falcon-7b",
+    "falcon": "fxmarty/really-tiny-falcon-testing",
     "fsmt": "hf-internal-testing/tiny-random-FSMTModel",
     "gpt2": "hf-internal-testing/tiny-random-GPT2Model",
     # NOTE: this tiny model does not use attention_softmax_in_fp32=True (contrary to e.g. starcoder)
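
For reference, here is a minimal standalone sketch (not part of the patch) of the past-key-value shape logic that `FalconDummyPastKeyValuesGenerator.__init__` now derives from the config: the full KV head count is kept when `new_decoder_architecture` is set or `multi_query` is off, and a single shared KV head is used otherwise. It assumes transformers>=4.34 and its `FalconConfig` defaults; the helper name `falcon_past_kv_shape` is hypothetical and exists only for illustration.

```python
# Illustrative sketch only; mirrors the shape computation added in this diff.
from transformers import FalconConfig


def falcon_past_kv_shape(config: FalconConfig, batch_size: int, sequence_length: int):
    # New decoder architecture (falcon-40b style) or non-multi-query models keep their
    # configured number of KV heads; plain multi-query models (falcon-7b style) share one.
    num_kv_heads = (
        config.num_kv_heads
        if (config.new_decoder_architecture or not config.multi_query)
        else 1
    )
    head_dim = config.hidden_size // config.num_attention_heads
    return (batch_size, num_kv_heads, sequence_length, head_dim)


# falcon-7b style defaults: multi_query=True, hidden_size=4544, 71 heads -> (2, 1, 16, 64)
print(falcon_past_kv_shape(FalconConfig(), batch_size=2, sequence_length=16))
```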