Calculate position ids in modeling utils for all generative models #30053

Closed · wants to merge 25 commits
Commits
d7fcb07
prepare position ids in modeling utils
zucchini-nlp Apr 4, 2024
eaecb4f
fix seq length when inputs embeds
zucchini-nlp Apr 4, 2024
d24796c
Merge remote-tracking branch 'upstream/main' into position_ids
zucchini-nlp Apr 4, 2024
cf66b56
forgot to fix starcoder2
zucchini-nlp Apr 5, 2024
ecba33f
fix copies
zucchini-nlp Apr 5, 2024
b769074
remove that print :)
zucchini-nlp Apr 9, 2024
b96725f
lets add same for assisted decoding
zucchini-nlp Apr 9, 2024
49f3495
Merge remote-tracking branch 'upstream/main' into position_ids
zucchini-nlp Apr 15, 2024
8e7a2bd
Merge remote-tracking branch 'upstream/main' into position_ids
zucchini-nlp Apr 17, 2024
ff4e424
framework equivalence?
zucchini-nlp Apr 17, 2024
5531118
final solution, lets make all frameworks same
zucchini-nlp Apr 18, 2024
6341955
Merge branch 'main' into position_ids
zucchini-nlp Apr 18, 2024
ce37742
new models
zucchini-nlp Apr 22, 2024
e28e551
tf fix cast
zucchini-nlp Apr 22, 2024
7e5e3bf
tf equivalence
zucchini-nlp Apr 22, 2024
15ad877
remove extra if conditions
zucchini-nlp Apr 23, 2024
0d9ee9f
make test parameterized
zucchini-nlp Apr 23, 2024
cbe4394
Merge remote-tracking branch 'upstream/main' into position_ids
zucchini-nlp Apr 25, 2024
0f20d92
fix failing flax cases
zucchini-nlp Apr 25, 2024
e749080
torch tests fail due to merge conflicts?
zucchini-nlp Apr 25, 2024
ef2494e
let the tests pass
zucchini-nlp Apr 29, 2024
d5f5989
import if available
zucchini-nlp Apr 29, 2024
cd10c73
fixes
zucchini-nlp Apr 29, 2024
0f1997c
encoder-decoder models
zucchini-nlp Apr 29, 2024
87befb7
fix llama flax
zucchini-nlp Apr 29, 2024
16 changes: 16 additions & 0 deletions src/transformers/generation/candidate_generator.py
@@ -190,6 +190,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,
self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder
)
self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len)
self.assistant_kwargs = _prepare_position_ids(self.assistant_kwargs, new_cur_len)

# 2. Forecast next N tokens using the assistant model.
assistant_generation_kwargs = {
@@ -423,3 +424,18 @@ def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Di
token_type_copies = final_token_type.repeat(1, type_length_diff)
model_kwargs["token_type_ids"] = torch.cat([model_kwargs["token_type_ids"], token_type_copies], dim=-1)
return model_kwargs


def _prepare_position_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
position_ids = model_kwargs.get("position_ids")
if position_ids is None:
return model_kwargs

# we assume batch_size=1 for assisted decoding (needs rework if bs > 1)
length_diff = new_length - position_ids[0, -1]
if length_diff < 0:
position_ids = position_ids[:, :length_diff]
elif length_diff > 0:
new_position_ids = torch.arange(position_ids[0, -1], new_length, device=position_ids.device).unsqueeze(0)
Review comment (Member) on lines +436 to +439:

Can you add a comment briefly explaining when each situation can be triggered, and why we want that operation? Our future selves will probably be happy with that comment.

e.g. I'm assuming length_diff > 0 is used when candidates are proposed, and thus we want the corresponding position ids. But I'm not immediately seeing when length_diff < 0 can be triggered :)

Review comment (Member):

^ this function still needs better variable names and/or a docstring

model_kwargs["position_ids"] = torch.cat([model_kwargs["position_ids"], new_position_ids], dim=-1)
return model_kwargs
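For illustration, a simplified batch-size-1 sketch of the idea behind `_prepare_position_ids`: cached `position_ids` are trimmed when candidate tokens are rejected and extended when candidates are appended. The sketch assumes no left padding (so positions equal token indices) and works from sequence lengths, whereas the helper above reads the last position value; `resize_position_ids` is a hypothetical name, not part of the diff.

```python
import torch

# Simplified sketch (assumes batch_size=1 and no left padding): resize cached
# position_ids so they match a new target sequence length.
def resize_position_ids(position_ids: torch.LongTensor, new_length: int) -> torch.LongTensor:
    cur_length = position_ids.shape[-1]
    if new_length < cur_length:
        # candidate tokens were rejected -> drop the surplus positions
        return position_ids[:, :new_length]
    if new_length > cur_length:
        # candidate tokens were appended -> add the missing positions
        extra = torch.arange(cur_length, new_length, device=position_ids.device).unsqueeze(0)
        return torch.cat([position_ids, extra], dim=-1)
    return position_ids

ids = torch.arange(5).unsqueeze(0)     # tensor([[0, 1, 2, 3, 4]])
print(resize_position_ids(ids, 8))     # tensor([[0, 1, 2, 3, 4, 5, 6, 7]])
print(resize_position_ids(ids, 3))     # tensor([[0, 1, 2]])
```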
6 changes: 6 additions & 0 deletions src/transformers/generation/utils.py
@@ -43,6 +43,7 @@
PromptLookupCandidateGenerator,
_crop_past_key_values,
_prepare_attention_mask,
_prepare_position_ids,
_prepare_token_type_ids,
)
from .configuration_utils import GenerationConfig, GenerationMode
@@ -674,6 +675,10 @@ def _update_model_kwargs_for_generation(
if "cache_position" in model_kwargs and model_kwargs["cache_position"] is not None:
model_kwargs["cache_position"] = model_kwargs["cache_position"][-1:] + num_new_tokens

if "position_ids" in model_kwargs and model_kwargs["position_ids"] is not None:
position_ids = model_kwargs["position_ids"]
model_kwargs["position_ids"] = torch.cat([position_ids, position_ids[:, -1:] + 1], dim=-1)

return model_kwargs

def _reorder_cache(self, past_key_values, beam_idx):
@@ -4685,6 +4690,7 @@ def _assisted_decoding(
candidate_kwargs = _prepare_attention_mask(
candidate_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder
)
candidate_kwargs = _prepare_position_ids(candidate_kwargs, candidate_input_ids.shape[1])
candidate_kwargs = _prepare_token_type_ids(candidate_kwargs, candidate_input_ids.shape[1])
if "cache_position" in candidate_kwargs:
candidate_kwargs["cache_position"] = torch.cat(
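A standalone illustration of the `_update_model_kwargs_for_generation` change above: once `position_ids` are present in `model_kwargs`, each decoding step only appends the previous last position plus one. This also behaves sensibly for a left-padded row whose pad slots were filled with 1 by the helper, since only the running maximum matters for the new token. A minimal sketch, not taken from the diff:

```python
import torch

# Per-step update: append (last position + 1) for every row.
position_ids = torch.tensor([[1, 1, 0, 1, 2],    # left-padded row (pad slots filled with 1)
                             [0, 1, 2, 3, 4]])   # unpadded row
for _ in range(3):  # pretend three tokens are generated
    position_ids = torch.cat([position_ids, position_ids[:, -1:] + 1], dim=-1)

print(position_ids)
# tensor([[1, 1, 0, 1, 2, 3, 4, 5],
#         [0, 1, 2, 3, 4, 5, 6, 7]])
```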
13 changes: 13 additions & 0 deletions src/transformers/modeling_flax_utils.py
@@ -1248,6 +1248,19 @@ def register_for_auto_class(cls, auto_class="FlaxAutoModel"):

cls._auto_class = auto_class

def get_position_ids_from_attention_mask(self, attention_mask, batch_size, seq_length):
"""
Infers position ids from the attention mask, falling back to a simple range over `seq_length`
when no mask is given. All call sites where `position_ids=None` should use this method.
"""
if attention_mask is not None:
position_ids = jnp.cumsum(attention_mask, axis=-1) - 1
position_ids = jnp.where(attention_mask == 0, 1, position_ids)
position_ids = position_ids[..., -seq_length:]
else:
position_ids = jnp.broadcast_to(jnp.arange(seq_length)[None, :], (batch_size, seq_length))
return position_ids


# To update the docstring, we need to copy the method, otherwise we change the original docstring.
FlaxPreTrainedModel.push_to_hub = copy_func(FlaxPreTrainedModel.push_to_hub)
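For reference, what the Flax helper above produces for a left-padded batch, reproduced outside the class (the `jnp` calls mirror the method body; the example data is made up):

```python
import jax.numpy as jnp

# Left-padded row (pad, pad, tok, tok, tok) next to an unpadded row.
attention_mask = jnp.array([[0, 0, 1, 1, 1],
                            [1, 1, 1, 1, 1]])
seq_length = attention_mask.shape[1]

position_ids = jnp.cumsum(attention_mask, axis=-1) - 1
position_ids = jnp.where(attention_mask == 0, 1, position_ids)   # pad slots become 1
position_ids = position_ids[..., -seq_length:]

print(position_ids)
# [[1 1 0 1 2]
#  [0 1 2 3 4]]
```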
14 changes: 14 additions & 0 deletions src/transformers/modeling_utils.py
@@ -4368,6 +4368,20 @@ def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask):

logger.warning_once(warn_string)

def get_position_ids_from_attention_mask(self, attention_mask, past_length, seq_length, device):
"""
Infers position ids from the attention mask and the past key-value cache length. All call sites
where `position_ids=None` should use this method.
"""
if attention_mask is not None:
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids = position_ids.masked_fill(attention_mask == 0, 1)
position_ids = position_ids[..., -seq_length:].view(-1, seq_length)
else:
position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0)
return position_ids

@property
def _is_quantized_training_enabled(self):
warnings.warn(
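And the same computation on the PyTorch side during cached generation, sketched standalone: the full attention mask (past plus current tokens) is passed, and only the trailing `seq_length` positions are kept for the tokens actually fed to the model. The lines mirror the helper body; the mask values are made up.

```python
import torch

# Left-padded sequence: 2 pads, 3 prompt tokens, then 1 freshly generated token.
attention_mask = torch.tensor([[0, 0, 1, 1, 1, 1]])
seq_length = 1   # only the newest token is fed; the rest sits in the kv cache

position_ids = attention_mask.long().cumsum(-1) - 1
position_ids = position_ids.masked_fill(attention_mask == 0, 1)
position_ids = position_ids[..., -seq_length:].view(-1, seq_length)

print(position_ids)  # tensor([[3]])
```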
28 changes: 16 additions & 12 deletions src/transformers/models/codegen/modeling_codegen.py
@@ -455,7 +455,8 @@ def forward(
else:
raise ValueError("You have to specify either input_ids or inputs_embeds")

device = input_ids.device if input_ids is not None else inputs_embeds.device
if inputs_embeds is None:
inputs_embeds = self.wte(input_ids)

if token_type_ids is not None:
token_type_ids = token_type_ids.view(-1, input_shape[-1])
@@ -467,8 +468,9 @@ def forward(
past_length = past_key_values[0][0].size(-2)

if position_ids is None:
position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0)
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=inputs_embeds.shape[1], device=inputs_embeds.device
)

# Attention mask.
if attention_mask is not None:
@@ -496,9 +498,6 @@ def forward(
# head_mask has shape n_layer x batch x num_attention_heads x N x N
head_mask = self.get_head_mask(head_mask, self.config.n_layer)

if inputs_embeds is None:
inputs_embeds = self.wte(input_ids)

hidden_states = inputs_embeds

if token_type_ids is not None:
@@ -597,6 +596,7 @@ def set_output_embeddings(self, new_embeddings):
def prepare_inputs_for_generation(self, input_ids, inputs_embeds=None, past_key_values=None, **kwargs):
token_type_ids = kwargs.get("token_type_ids", None)
# Omit tokens covered by past_key_values
past_length = 0
if past_key_values:
past_length = past_key_values[0][0].shape[2]

@@ -614,12 +614,16 @@ def prepare_inputs_for_generation(self, input_ids, inputs_embeds=None, past_key_
attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)

if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
seq_length = (
inputs_embeds.shape[1] if inputs_embeds is not None and past_key_values is None else input_ids.shape[1]
)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=seq_length, device=device
)
else:
position_ids = position_ids[:, -seq_length:]
Review comment (Member) on lines +617 to +626:

I think we can remove all this code, actually 👀 I see the following cases:

  1. position_ids is None -> the forward pass correctly computes position_ids, due to the changes in this PR
  2. position_ids is not None -> the user has defined position_ids, and it is their responsibility to pass them correctly

WDYT? (this logic would apply to all models, and would make maintenance easier for us 👼 )


# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
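The `seq_length` bookkeeping this PR repeats in each `prepare_inputs_for_generation` can be sketched on its own: with a populated cache only the trailing tokens are fed, so user-supplied `position_ids` are sliced to that same window. `pick_seq_length` is a hypothetical stand-in name, not part of the diff.

```python
import torch

# Hypothetical stand-in for the repeated pattern above.
def pick_seq_length(input_ids, inputs_embeds, past_key_values):
    if inputs_embeds is not None and past_key_values is None:
        return inputs_embeds.shape[1]   # first step driven by inputs_embeds
    return input_ids.shape[1]           # subsequent steps (or plain input_ids)

input_ids = torch.tensor([[42]])                  # one new token after a 5-token prompt
user_position_ids = torch.arange(6).unsqueeze(0)  # user passed ids for the whole sequence
seq_length = pick_seq_length(input_ids, None, past_key_values=[("dummy",)])

print(user_position_ids[:, -seq_length:])         # tensor([[5]])
```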
20 changes: 13 additions & 7 deletions src/transformers/models/cohere/modeling_cohere.py
@@ -909,7 +909,9 @@ def forward(
)

if position_ids is None:
position_ids = cache_position.unsqueeze(0)
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_seen_tokens, seq_length=inputs_embeds.shape[1], device=inputs_embeds.device
)

causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position, past_seen_tokens)

@@ -1227,12 +1229,16 @@ def prepare_inputs_for_generation(
attention_mask = attention_mask[:, -max_cache_length:]

position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
seq_length = (
inputs_embeds.shape[1] if inputs_embeds is not None and past_key_values is None else input_ids.shape[1]
)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=seq_length, device=device
)
else:
position_ids = position_ids[:, -seq_length:]

# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
17 changes: 15 additions & 2 deletions src/transformers/models/ctrl/modeling_ctrl.py
@@ -412,9 +412,11 @@ def forward(
past_key_values = tuple([None] * len(self.h))
else:
past_length = past_key_values[0][0].size(-2)

if position_ids is None:
position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
position_ids = position_ids.unsqueeze(0)
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=input_shape[1], device=device
)

# Attention mask.
if attention_mask is not None:
@@ -525,6 +527,7 @@ def set_output_embeddings(self, new_embeddings):

def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
# only last tokens for inputs_ids if past is defined in kwargs
past_length = 0
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]

@@ -537,6 +540,16 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cac

input_ids = input_ids[:, remove_prefix_length:]

attention_mask = kwargs.get("attention_mask", None)
position_ids = kwargs.get("position_ids", None)

if position_ids is None:
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=input_ids.shape[1], device=input_ids.device
)
else:
position_ids = position_ids[:, -input_ids.shape[1] :]

return {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}

@add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
15 changes: 12 additions & 3 deletions src/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -344,8 +344,15 @@ def call(
else:
past_length = shape_list(past_key_values[0][0])[-2]
if position_ids is None:
position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32), axis=0)
position_ids = tf.tile(position_ids, [input_shape[0], 1])
if attention_mask is not None:
position_ids = tf.cumsum(tf.cast(attention_mask, tf.int64), axis=-1) - 1
# create ones tensor to match dtypes, otherwise we get errors
ones_tensor = tf.ones_like(position_ids, dtype=tf.int64)
position_ids = tf.where(attention_mask == 0, ones_tensor, position_ids)
position_ids = position_ids[..., -input_shape[-1] :]
position_ids = tf.reshape(position_ids, (-1, input_shape[-1]))
else:
position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)
Review comment (Member) on lines +348 to +355:

Suggested change: replace

if attention_mask is not None:
position_ids = tf.cumsum(tf.cast(attention_mask, tf.int64), axis=-1) - 1
# create ones tensor to match dtypes, otherwise we get errors
ones_tensor = tf.ones_like(position_ids, dtype=tf.int64)
position_ids = tf.where(attention_mask == 0, ones_tensor, position_ids)
position_ids = position_ids[..., -input_shape[-1] :]
position_ids = tf.reshape(position_ids, (-1, input_shape[-1]))
else:
position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)

with:

position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)

(see comment below)

Review comment (Member):

the same logic applies to other TF models


# Attention mask.
if attention_mask is not None:
@@ -702,7 +709,9 @@ def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=
attention_mask = kwargs.get("attention_mask", None)

if attention_mask is not None and position_ids is None:
position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
Review comment (Member):

this one should be correct, no? 🤔

(the same comment applies to other TF models)

position_ids = tf.cumsum(tf.cast(attention_mask, tf.int64), axis=-1) - 1
ones_tensor = tf.ones_like(position_ids, dtype=tf.int64)
position_ids = tf.where(attention_mask == 0, ones_tensor, position_ids)
if past_key_values:
position_ids = tf.expand_dims(position_ids[:, -1], -1)

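To make the TF discussion above concrete, a small comparison of the two variants, sketched outside the model: the PR's cumsum-minus-one with pad slots forced to 1, versus the reviewer's exclusive cumsum. The real tokens get identical positions either way; the variants differ only in what padded slots receive, which the attention mask should make irrelevant. The mask values below are made up.

```python
import tensorflow as tf

# Left-padded row: pad, pad, tok, tok, tok
attention_mask = tf.constant([[0, 0, 1, 1, 1]], dtype=tf.int64)

# PR variant: cumsum - 1, with pad slots set to 1
pr_ids = tf.cumsum(attention_mask, axis=-1) - 1
pr_ids = tf.where(attention_mask == 0, tf.ones_like(pr_ids), pr_ids)

# Reviewer's suggestion: exclusive cumsum
suggested_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)

print(pr_ids.numpy())         # [[1 1 0 1 2]]
print(suggested_ids.numpy())  # [[0 0 0 1 2]]
```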
21 changes: 14 additions & 7 deletions src/transformers/models/dbrx/modeling_dbrx.py
@@ -1145,7 +1145,10 @@ def forward(
)

if position_ids is None:
position_ids = cache_position.unsqueeze(0)
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_seen_tokens, seq_length=inputs_embeds.shape[1], device=inputs_embeds.device
)

causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)

# embed positions
@@ -1470,12 +1473,16 @@ def prepare_inputs_for_generation(
attention_mask = attention_mask[:, -max_cache_length:]

position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
seq_length = (
inputs_embeds.shape[1] if inputs_embeds is not None and past_key_values is None else input_ids.shape[1]
)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=seq_length, device=device
)
else:
position_ids = position_ids[:, -seq_length:]

if self.generation_config.cache_implementation == "static":
# generation with static cache
@@ -593,6 +593,7 @@ def forward(
argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
}

print(self.encoder)
Review comment (Member):

Suggested change: remove the print(self.encoder) line.

if encoder_outputs is None:
encoder_outputs = self.encoder(
input_ids=input_ids,
@@ -737,12 +737,10 @@ def prepare_inputs_for_generation(
# Thus we can create a single static attention_mask here, which is more efficient for compilation
extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
if decoder_attention_mask is not None:
decoder_position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
else:
decoder_position_ids = jnp.broadcast_to(
jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)
)
decoder_position_ids = self.get_position_ids_from_attention_mask(
decoder_attention_mask, batch_size, seq_length
)

return {
"past_key_values": past_key_values,
24 changes: 14 additions & 10 deletions src/transformers/models/falcon/modeling_falcon.py
@@ -1077,11 +1077,9 @@ def forward(
else:
alibi = None
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_key_values_length, seq_length=seq_length, device=inputs_embeds.device
)
position_ids = position_ids.unsqueeze(0)

if self._use_flash_attention_2:
# 2d mask is passed through the layers
@@ -1215,6 +1213,7 @@ def prepare_inputs_for_generation(
inputs_embeds: Optional[torch.Tensor] = None,
**kwargs,
) -> dict:
past_length = 0
if past_key_values is not None:
past_length = past_key_values[0][0].shape[2]

@@ -1228,12 +1227,17 @@ def prepare_inputs_for_generation(
input_ids = input_ids[:, remove_prefix_length:]

# Note: versions of Falcon with alibi do not use position_ids. It is used with RoPE.
if not self.transformer.use_alibi and attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -input_ids.shape[1] :]
if not self.transformer.use_alibi:
seq_length = (
inputs_embeds.shape[1] if inputs_embeds is not None and past_key_values is None else input_ids.shape[1]
)
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = self.get_position_ids_from_attention_mask(
attention_mask, past_length, seq_length=seq_length, device=device
)
else:
position_ids = position_ids[:, -seq_length:]

if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}