enable StaticCache for assisted generation #34797

Open
yao-matrix wants to merge 51 commits into base: main
Changes from 22 commits (51 commits total)
c205b2e
enable StaticCache for assisted generation
yao-matrix Nov 19, 2024
30021dd
update
yao-matrix Nov 19, 2024
620c861
remove warnings import
yao-matrix Nov 19, 2024
b5283e9
enable StaticCache for assisted generation
yao-matrix Nov 19, 2024
71b7d22
update
yao-matrix Nov 19, 2024
c967bbe
remove warnings import
yao-matrix Nov 19, 2024
980aa08
done
yao-matrix Nov 20, 2024
c79411d
done
yao-matrix Nov 20, 2024
c717652
fix review comments
yao-matrix Nov 22, 2024
67618e5
Merge branch 'main' of https://github.com/yao-matrix/transformers
yao-matrix Nov 22, 2024
c8e2428
Merge branch 'main' into main
yao-matrix Nov 26, 2024
fde7ebd
Merge branch 'main' into main
yao-matrix Nov 26, 2024
e1169a3
Merge branch 'main' into main
yao-matrix Nov 29, 2024
8a9a753
add static cache ci
Nov 29, 2024
b74a7fe
Merge branch 'main' of https://github.com/yao-matrix/transformers
Nov 29, 2024
e67a3fd
skip Gemma2 StaticCache CI since it uses HybridCache
Nov 29, 2024
177634c
ruff format
Nov 29, 2024
45d0410
fix phimoe ci
yao-matrix Nov 29, 2024
a33f660
fix mixtral ci
yao-matrix Nov 29, 2024
ff07e47
fix ci
yao-matrix Nov 29, 2024
fbef806
cont.
yao-matrix Nov 29, 2024
3564a87
ci
yao-matrix Nov 29, 2024
b9cf597
fix ci
yao-matrix Dec 2, 2024
803166d
ci
yao-matrix Dec 2, 2024
df1594c
ci
yao-matrix Dec 2, 2024
87b7f15
ci
yao-matrix Dec 2, 2024
e60b1fe
Merge branch 'main' into main
yao-matrix Dec 2, 2024
5e195c2
ci
yao-matrix Dec 2, 2024
759da36
ci
yao-matrix Dec 2, 2024
093b647
ci
yao-matrix Dec 2, 2024
3775dc2
ci
yao-matrix Dec 2, 2024
817d303
ci
yao-matrix Dec 2, 2024
6e2ad2a
add # Ignore copy
yao-matrix Dec 2, 2024
99b6bc2
using a smarter way, ignore in test_utils
yao-matrix Dec 2, 2024
af33391
ci
yao-matrix Dec 2, 2024
587b55f
skip Gemma2, it declares support static cache, but it's hybrid cache a…
yao-matrix Dec 2, 2024
0a49d6f
refine error message
yao-matrix Dec 2, 2024
1deeb55
Merge branch 'main' into main
yao-matrix Dec 4, 2024
62b70e4
Merge branch 'main' into main
yao-matrix Dec 5, 2024
210c2e0
Merge branch 'main' into main
yao-matrix Dec 10, 2024
7b97aa4
add test case test_assisted_decoding_compile
yao-matrix Dec 11, 2024
9cb45da
Merge branch 'main' into main
yao-matrix Dec 11, 2024
93cd7bf
fix bug
yao-matrix Dec 11, 2024
3cc23d7
Merge branch 'main' into main
yao-matrix Dec 13, 2024
b45336c
Merge branch 'main' into main
yao-matrix Dec 15, 2024
04f2ea1
Merge branch 'main' into main
yao-matrix Dec 18, 2024
b08d1fc
Merge branch 'main' into main
yao-matrix Dec 19, 2024
0904268
Merge branch 'main' into main
yao-matrix Dec 20, 2024
dd148a8
Merge branch 'main' into main
yao-matrix Dec 22, 2024
3171476
cohere2 is HybridCache
Dec 23, 2024
4e064ec
Merge branch 'main' into main
yao-matrix Jan 2, 2025
25 changes: 25 additions & 0 deletions src/transformers/cache_utils.py
@@ -1237,6 +1237,31 @@ def update(

return k_out, v_out

    def crop(self, max_length: int):
        """Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
        negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search."""
        seq_length = self.get_seq_length()
        # In case it is negative
        if max_length < 0:
            max_length = seq_length - abs(max_length)

        if seq_length <= max_length:
            return

        begin = max_length
        end = seq_length + 1
        index = torch.arange(begin, end, device=self.key_cache[0].device)

        self._seen_tokens = max_length
        for idx in range(len(self.key_cache)):
            try:
                self.key_cache[idx].index_fill_(2, index, 0)
                self.value_cache[idx].index_fill_(2, index, 0)
            except NotImplementedError:
                # The operator 'aten::index_fill' is not currently implemented for the MPS device.
                self.key_cache[idx][:, :, index] = 0
                self.value_cache[idx][:, :, index] = 0

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
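For reference, a minimal, illustrative sketch of what the new `StaticCache.crop()` does, exercised directly rather than through `generate()`. Nothing below comes from the PR itself; the tiny `LlamaConfig` and the exact `StaticCache` constructor arguments are assumptions and may differ slightly across library versions.

```python
# Illustrative only: fill a StaticCache, then roll it back with the new crop().
import torch

from transformers import LlamaConfig
from transformers.cache_utils import StaticCache

# Tiny config purely for demonstration (head_dim = 64 / 4 = 16).
config = LlamaConfig(num_hidden_layers=2, num_attention_heads=4, num_key_value_heads=4, hidden_size=64)
cache = StaticCache(config=config, batch_size=1, max_cache_len=16, device="cpu", dtype=torch.float32)

# Pretend the model already wrote 8 tokens into every layer of the cache.
states = torch.ones(1, 4, 8, 16)  # (batch, kv_heads, seq_len, head_dim)
for layer_idx in range(config.num_hidden_layers):
    cache.update(states, states, layer_idx, cache_kwargs={"cache_position": torch.arange(8)})
print(int(cache.get_seq_length()))  # 8

# Assisted decoding accepted only the first 5 tokens: zero out the rejected tail slots.
cache.crop(5)
print(int(cache.get_seq_length()))  # 5
```

Unlike `DynamicCache.crop()`, which truncates the tensors, the static variant keeps the preallocated shape (a requirement for `torch.compile`) and only zeroes the slots past `max_length`, which is what `get_seq_length()` keys off.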
8 changes: 3 additions & 5 deletions src/transformers/generation/candidate_generator.py
@@ -19,7 +19,7 @@
import numpy as np
import torch

from ..cache_utils import DynamicCache
from ..cache_utils import Cache
from ..pytorch_utils import isin_mps_friendly
from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor

@@ -177,9 +177,6 @@ def __init__(
"Please pass in `min_length` into `.generate()` instead"
)

# We need to roll back the cache in assisted generation, only DynamicCache is supported
self.generation_config.cache_implementation = None

def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
"""
Fetches the candidates to be tried for the current input.
@@ -229,6 +226,7 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,

# 3. Update variables for the next round of candidate generation
self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
self.generation_config.cache_implementation = None

# 4. Prepare variables for output
candidate_logits = torch.stack(assistant_output.scores, dim=1)
@@ -748,7 +746,7 @@ def _crop_past_key_values(model, past_key_values, max_length):
else:
for idx in range(len(past_key_values)):
past_key_values[idx] = past_key_values[idx][:, :, :max_length, :]
elif isinstance(past_key_values, DynamicCache):
elif isinstance(past_key_values, Cache):
past_key_values.crop(max_length)
elif past_key_values is not None:
for idx in range(len(past_key_values)):
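For context, a condensed sketch of the rollback that `_crop_past_key_values` performs after the target model rejects some drafted tokens. This is illustrative, not the PR's exact code; widening the `isinstance` check from `DynamicCache` to `Cache` is what lets a `StaticCache` take the same `crop()` path.

```python
# Condensed, illustrative sketch of the rollback dispatch in assisted generation.
from transformers.cache_utils import Cache


def rollback_cache(past_key_values, max_length: int):
    """Keep only the first `max_length` tokens of the draft cache."""
    if isinstance(past_key_values, Cache):
        # DynamicCache truncates its tensors in place; StaticCache (with this PR) zeroes
        # the tail slots and resets its token counter. Both sit behind the same crop() API.
        past_key_values.crop(max_length)
        return past_key_values
    if past_key_values is not None:
        # Legacy per-layer tensor format: slice along the sequence dimension instead.
        return tuple(layer[:, :, :max_length, :] for layer in past_key_values)
    return None
```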
24 changes: 12 additions & 12 deletions src/transformers/generation/utils.py
@@ -430,7 +430,7 @@ def prepare_inputs_for_generation(
model_input = model_input.clone(memory_format=torch.contiguous_format)
model_inputs[model_input_name] = model_input

# 6. Create 4D attention mask is we are using a `StaticCache` (important for performant compiled forward pass)
# 6. Create 4D attention mask if we are using a `StaticCache` (important for performant compiled forward pass)
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
@@ -1749,16 +1749,6 @@ def _prepare_cache_for_generation(
return

# Otherwise we NEED to prepare a cache, based on `generation_config.cache_implementation`

# TODO(joao): support static caches in assisted generation. assisted generation needs to roll back caches,
# which is only supported in dynamic caches atm
if assistant_model is not None and generation_config.cache_implementation is not None:
logger.warning_once(
"An assistant model is provided, using a dynamic cache instead of a cache of type="
f"'{generation_config.cache_implementation}'."
)
generation_config.cache_implementation = None

if generation_config.cache_implementation is not None:
if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
if generation_config.cache_implementation == "static" and not self._supports_static_cache:
@@ -1773,6 +1763,15 @@
device=device,
model_kwargs=model_kwargs,
)
if assistant_model is not None:
assistant_model._get_cache(
cache_implementation=generation_config.cache_implementation,
batch_size=max(generation_config.num_beams, generation_config.num_return_sequences)
* batch_size,
max_cache_len=max_cache_length,
device=device,
model_kwargs=model_kwargs,
)
elif generation_config.cache_implementation == "quantized":
if not self._supports_quantized_cache:
raise ValueError(
@@ -2119,6 +2118,7 @@ def generate(
and not self.config.is_encoder_decoder
):
max_cache_length += inputs_tensor.shape[1]

self._prepare_cache_for_generation(
generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device
)
@@ -2172,7 +2172,7 @@
raise ValueError("assisted generate is only supported for batch_size = 1")
if not model_kwargs["use_cache"]:
raise ValueError("assisted generate requires `use_cache=True`")
if generation_config.cache_implementation in ["static", "hybrid", "sliding_window"]:
if generation_config.cache_implementation in ["hybrid", "sliding_window"]:
raise ValueError("assisted generate is not supported with Static cache classes`")
if self._is_stateful:
# In assisted generation we need the ability to confirm whether the model would pick certain tokens,
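With the forced fallback to `DynamicCache` removed, a call along the following lines should now keep a static cache on both the target and the assistant model. The checkpoint names are placeholders rather than anything from the PR; any pair of compatible models that declare static-cache support should behave the same way.

```python
# Illustrative end-to-end call; checkpoint names are placeholders.
from transformers import AutoModelForCausalLM, AutoTokenizer

target_id, assistant_id = "meta-llama/Llama-3.1-8B-Instruct", "meta-llama/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(target_id)
model = AutoModelForCausalLM.from_pretrained(target_id, torch_dtype="auto", device_map="auto")
assistant = AutoModelForCausalLM.from_pretrained(assistant_id, torch_dtype="auto", device_map="auto")

inputs = tokenizer("The theory of special relativity states", return_tensors="pt").to(model.device)

# Before this change, providing an assistant model silently replaced cache_implementation="static"
# with a DynamicCache; now both models get their own StaticCache. Assisted generation still
# requires batch_size = 1.
output = model.generate(
    **inputs,
    assistant_model=assistant,
    cache_implementation="static",
    max_new_tokens=64,
)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```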
5 changes: 4 additions & 1 deletion tests/generation/test_utils.py
@@ -2088,8 +2088,9 @@ def test_generate_methods_with_num_logits_to_keep(self):
without_all_logits = model.generate(**inputs_dict, **generation_kwargs)
self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist())

@parameterized.expand([(None, True), ("static", False)])
@pytest.mark.generate
def test_assisted_decoding_with_num_logits_to_keep(self):
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
for model_class in self.all_generative_model_classes:
if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()):
self.skipTest(reason="This model does not support `num_logits_to_keep` argument.")
@@ -2114,6 +2115,8 @@ def test_assisted_decoding_with_num_logits_to_keep(self):
"assistant_model": assistant_model,
"return_dict_in_generate": True,
"output_scores": True,
"cache_implementation": cache_implementation,
"return_legacy_cache": return_legacy_cache,
}

# Setting num_logits_to_keep at 0 keeps all logits (old behavior)
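For orientation, a rough sketch of what the two parameter sets exercise in this test: `(None, True)` keeps the existing dynamic cache returned in the legacy tuple format, while `("static", False)` runs assisted decoding on top of a `StaticCache` returned as a `Cache` object. The model name below is a placeholder, not something the test pins down.

```python
# Rough, illustrative sketch of the two parameterizations (model name is a placeholder).
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import StaticCache

model_id = "HuggingFaceTB/SmolLM-135M"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
assistant = AutoModelForCausalLM.from_pretrained(model_id)

inputs = tokenizer("Hello", return_tensors="pt")
common = {"assistant_model": assistant, "max_new_tokens": 8, "return_dict_in_generate": True}

# (None, True): dynamic cache, converted back to the legacy tuple format on return.
out = model.generate(**inputs, cache_implementation=None, return_legacy_cache=True, **common)
print(type(out.past_key_values))  # tuple of per-layer (key, value) pairs

# ("static", False): StaticCache for both models, returned as a Cache object.
out = model.generate(**inputs, cache_implementation="static", return_legacy_cache=False, **common)
print(isinstance(out.past_key_values, StaticCache))  # True
```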
6 changes: 6 additions & 0 deletions tests/models/gemma2/test_modeling_gemma2.py
@@ -113,6 +113,12 @@ def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type):
def test_assisted_decoding_sample(self):
pass

@parameterized.expand([(None, True), ("static", False)])
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
if cache_implementation == "static":
self.skipTest("Gemma2 has HybridCache which is not compatible with assisted decoding StaticCache")
pass

Reviewer comment (Member):
let's not skip entirely, but only the static_cache test, as we still need to check if assisted generation works in Gemma2 :)

Maybe it will be skipped by the model's `_supports_static_cache` as I've commented above, but if not we can skip only `test_assisted_decoding_with_num_logits_to_keep_1_static` (maybe it's called a bit differently)

Author reply (yao-matrix):
I switched to `_supports_static_cache` to skip the case. Gemma2 is a bit different: it uses HybridCache but still declares `_supports_static_cache = True`, so I keep the skip in the model test file. I will remove that skip once HybridCache is enabled for assisted decoding, which I plan to do after this PR (pure StaticCache) is merged, thanks.
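To make the two-level skip described in this thread concrete, a small self-contained sketch follows. It is not the PR's code: `_FakeGemma2Model` is invented for illustration, the common test's real guard lives in `tests/generation/test_utils.py`, and the Gemma2-specific skip is the one shown in the diff above.

```python
# Illustrative sketch of the skip logic discussed above, runnable on its own.
import unittest

from parameterized import parameterized


class _FakeGemma2Model:
    # Gemma2 declares static-cache support, but generation actually builds a HybridCache.
    _supports_static_cache = True


class Gemma2SkipDemo(unittest.TestCase):
    all_generative_model_classes = (_FakeGemma2Model,)

    @parameterized.expand([(None, True), ("static", False)])
    def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
        # Generic guard (common test): skip models that cannot build a StaticCache at all.
        for model_class in self.all_generative_model_classes:
            if cache_implementation == "static" and not model_class._supports_static_cache:
                self.skipTest(f"{model_class.__name__} does not support StaticCache")
        # Gemma2-specific guard: HybridCache is not yet supported for assisted decoding.
        if cache_implementation == "static":
            self.skipTest("Gemma2 uses HybridCache, which assisted decoding does not support yet")
        # The shared assisted-decoding checks would run here for the remaining case.


if __name__ == "__main__":
    unittest.main()
```

With parameterized's default naming, the expanded static case should end up as `test_assisted_decoding_with_num_logits_to_keep_1_static`, the name the review comment above guesses at.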

@unittest.skip("Gemma2 has HybridCache which is not compatible with dola decoding")
def test_dola_decoding_sample(self):
pass
9 changes: 9 additions & 0 deletions tests/models/jetmoe/test_modeling_jetmoe.py
@@ -383,6 +383,15 @@ def test_past_key_values_format(self):
def test_flash_attn_2_inference_equivalence_right_padding(self):
self.skipTest(reason="JetMoe flash attention does not support right padding")

# Copied from tests.models.phimoe.test_modeling_phimoe.PhimoeModelTest.test_assisted_decoding_with_num_logits_to_keep with phimoe->jetmoe, Phimoe->JetMoe
@parameterized.expand([(None, True), ("static", False)])
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
if cache_implementation == "static":
self.skipTest(
"JetMoe doesn't support StaticCache, please check the following issue -> https://github.com/huggingface/transformers/issues/28981."
)
pass


@require_torch
class JetMoeIntegrationTest(unittest.TestCase):
10 changes: 10 additions & 0 deletions tests/models/mixtral/test_modeling_mixtral.py
@@ -17,6 +17,7 @@
import unittest

import pytest
from parameterized import parameterized

from transformers import MixtralConfig, is_torch_available
from transformers.testing_utils import (
@@ -421,6 +422,15 @@ def test_past_key_values_format(self):
def test_flash_attn_2_inference_equivalence_right_padding(self):
self.skipTest(reason="Mixtral flash attention does not support right padding")

# Copied from tests.models.phimoe.test_modeling_phimoe.PhimoeModelTest.test_assisted_decoding_with_num_logits_to_keep with phimoe->mixtral, Phimoe->Mixtral
@parameterized.expand([(None, True), ("static", False)])
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
if cache_implementation == "static":
self.skipTest(
"Mixtral doesn't support StaticCache, please check the following issue -> https://github.com/huggingface/transformers/issues/28981."
)
pass

# Ignore copy
def test_load_balancing_loss(self):
r"""
9 changes: 9 additions & 0 deletions tests/models/moshi/test_modeling_moshi.py
@@ -362,6 +362,15 @@ def test_disk_offload_safetensors(self):
def test_save_load(self):
super().test_save_load()

# Copied from tests.models.phimoe.test_modeling_phimoe.PhimoeModelTest.test_assisted_decoding_with_num_logits_to_keep with phimoe->moshi, Phimoe->Moshi
@parameterized.expand([(None, True), ("static", False)])
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
if cache_implementation == "static":
self.skipTest(
"Moshi decoder doesn't support StaticCache, please check the following issue -> https://github.com/huggingface/transformers/issues/28981."
)
pass


class MoshiTester:
def __init__(
9 changes: 9 additions & 0 deletions tests/models/phi3/test_modeling_phi3.py
@@ -490,6 +490,15 @@ def test_model_rope_scaling_short_long_factor(self, scaling_type):
# Last token generated using long factor
self.assertTrue(torch.allclose(last_token_logits, regenerated_last_token_logits, atol=1e-2, rtol=1e-2))

# Copied from tests.models.phimoe.test_modeling_phimoe.PhimoeModelTest.test_assisted_decoding_with_num_logits_to_keep with phimoe->phi3, Phimoe->Phi3
@parameterized.expand([(None, True), ("static", False)])
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
if cache_implementation == "static":
self.skipTest(
"Phi3 doesn't support StaticCache, please check the following issue -> https://github.com/huggingface/transformers/issues/28981."
)
pass


@slow
@require_torch
8 changes: 8 additions & 0 deletions tests/models/phimoe/test_modeling_phimoe.py
@@ -493,6 +493,14 @@ def test_model_rope_scaling_short_long_factor(self, scaling_type):
# Last token generated using long factor
self.assertTrue(torch.allclose(last_token_logits, regenerated_last_token_logits, atol=1e-2, rtol=1e-2))

@parameterized.expand([(None, True), ("static", False)])
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
if cache_implementation == "static":
self.skipTest(
"Phimoe doesn't support StaticCache, please check the following issue -> https://github.com/huggingface/transformers/issues/28981."
)
pass


@slow
@require_torch