enable StaticCache for assisted generation #34797

Open · wants to merge 51 commits into base: main

51 commits (changes shown from 46 commits)
c205b2e
enable StaticCache for assisted generation
yao-matrix Nov 19, 2024
30021dd
update
yao-matrix Nov 19, 2024
620c861
remove warnings import
yao-matrix Nov 19, 2024
b5283e9
enable StaticCache for assisted generation
yao-matrix Nov 19, 2024
71b7d22
update
yao-matrix Nov 19, 2024
c967bbe
remove warnings import
yao-matrix Nov 19, 2024
980aa08
done
yao-matrix Nov 20, 2024
c79411d
done
yao-matrix Nov 20, 2024
c717652
fix review comments
yao-matrix Nov 22, 2024
67618e5
Merge branch 'main' of https://github.com/yao-matrix/transformers
yao-matrix Nov 22, 2024
c8e2428
Merge branch 'main' into main
yao-matrix Nov 26, 2024
fde7ebd
Merge branch 'main' into main
yao-matrix Nov 26, 2024
e1169a3
Merge branch 'main' into main
yao-matrix Nov 29, 2024
8a9a753
add static cache ci
Nov 29, 2024
b74a7fe
Merge branch 'main' of https://github.com/yao-matrix/transformers
Nov 29, 2024
e67a3fd
skip Gemma2 StaticCache CI since it uses HybridCache
Nov 29, 2024
177634c
ruff format
Nov 29, 2024
45d0410
fix phimoe ci
yao-matrix Nov 29, 2024
a33f660
fix mixtral ci
yao-matrix Nov 29, 2024
ff07e47
fix ci
yao-matrix Nov 29, 2024
fbef806
cont.
yao-matrix Nov 29, 2024
3564a87
ci
yao-matrix Nov 29, 2024
b9cf597
fix ci
yao-matrix Dec 2, 2024
803166d
ci
yao-matrix Dec 2, 2024
df1594c
ci
yao-matrix Dec 2, 2024
87b7f15
ci
yao-matrix Dec 2, 2024
e60b1fe
Merge branch 'main' into main
yao-matrix Dec 2, 2024
5e195c2
ci
yao-matrix Dec 2, 2024
759da36
ci
yao-matrix Dec 2, 2024
093b647
ci
yao-matrix Dec 2, 2024
3775dc2
ci
yao-matrix Dec 2, 2024
817d303
ci
yao-matrix Dec 2, 2024
6e2ad2a
add # Ignore copy
yao-matrix Dec 2, 2024
99b6bc2
using a smarter way, ignore in test_utils
yao-matrix Dec 2, 2024
af33391
ci
yao-matrix Dec 2, 2024
587b55f
skip Gemma2, it declares support for static cache, but it's hybrid cache a…
yao-matrix Dec 2, 2024
0a49d6f
refine error message
yao-matrix Dec 2, 2024
1deeb55
Merge branch 'main' into main
yao-matrix Dec 4, 2024
62b70e4
Merge branch 'main' into main
yao-matrix Dec 5, 2024
210c2e0
Merge branch 'main' into main
yao-matrix Dec 10, 2024
7b97aa4
add test case test_assisted_decoding_compile
yao-matrix Dec 11, 2024
9cb45da
Merge branch 'main' into main
yao-matrix Dec 11, 2024
93cd7bf
fix bug
yao-matrix Dec 11, 2024
3cc23d7
Merge branch 'main' into main
yao-matrix Dec 13, 2024
b45336c
Merge branch 'main' into main
yao-matrix Dec 15, 2024
04f2ea1
Merge branch 'main' into main
yao-matrix Dec 18, 2024
b08d1fc
Merge branch 'main' into main
yao-matrix Dec 19, 2024
0904268
Merge branch 'main' into main
yao-matrix Dec 20, 2024
dd148a8
Merge branch 'main' into main
yao-matrix Dec 22, 2024
3171476
cohere2 is HybridCache
Dec 23, 2024
4e064ec
Merge branch 'main' into main
yao-matrix Jan 2, 2025
25 changes: 25 additions & 0 deletions src/transformers/cache_utils.py
@@ -1215,6 +1215,31 @@ def update(

return k_out, v_out

def crop(self, max_length: int):
"""Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search."""
seq_length = self.get_seq_length()
# In case it is negative
if max_length < 0:
max_length = seq_length - abs(max_length)

if seq_length <= max_length:
return

begin = max_length
end = seq_length + 1
index = torch.arange(begin, end, device=self.key_cache[0].device)

self._seen_tokens = max_length
for idx in range(len(self.key_cache)):
try:
self.key_cache[idx].index_fill_(2, index, 0)
self.value_cache[idx].index_fill_(2, index, 0)
except NotImplementedError:
# The operator 'aten::index_fill' is not currently implemented for the MPS device.
self.key_cache[idx][:, :, index] = 0
self.value_cache[idx][:, :, index] = 0

def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
"""Returns the sequence length of the cached states that were seen by the model."""
# Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
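Since `StaticCache` pre-allocates tensors of length `max_cache_len`, cropping cannot shrink them the way `DynamicCache.crop` slices its tensors; the new method instead zeroes the slots past the target length and resets `_seen_tokens`, so `get_seq_length()` (which counts non-zero slots, per its own comment below) reports the rolled-back length. A minimal standalone sketch of that idea, assuming the usual `(batch, num_heads, max_cache_len, head_dim)` layout; this is an illustration, not the library code:

```python
# Standalone sketch (not the library API) of the rollback semantics added above:
# slots past the target length are zeroed in place, so a "non-zero slot" count
# such as StaticCache.get_seq_length() reports the cropped length afterwards.
import torch


def crop_static_layer(key: torch.Tensor, value: torch.Tensor, seq_length: int, max_length: int) -> int:
    """key/value are (batch, num_heads, max_cache_len, head_dim); returns the new length."""
    if max_length < 0:  # negative means "drop the last |max_length| tokens"
        max_length = seq_length - abs(max_length)
    if seq_length <= max_length:
        return seq_length  # nothing to roll back
    index = torch.arange(max_length, seq_length, device=key.device)
    key.index_fill_(2, index, 0)
    value.index_fill_(2, index, 0)
    return max_length


# Toy single layer: 8 pre-allocated slots, the first 5 filled with non-zero values.
key = torch.ones(1, 2, 8, 4)
value = torch.ones(1, 2, 8, 4)
key[:, :, 5:] = 0
value[:, :, 5:] = 0

new_len = crop_static_layer(key, value, seq_length=5, max_length=3)
print(new_len)          # 3
print(key[0, 0, :, 0])  # slots 3 and 4 are now zero, matching the rolled-back length
```

The design consequence: rollback clears slots rather than resizing anything, which keeps tensor shapes stable for `torch.compile` while letting the non-zero-based length check stay valid.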
10 changes: 5 additions & 5 deletions src/transformers/generation/candidate_generator.py
@@ -25,7 +25,7 @@
if is_sklearn_available():
from sklearn.metrics import roc_curve

from ..cache_utils import DynamicCache
from ..cache_utils import Cache
from ..pytorch_utils import isin_mps_friendly
from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor

@@ -183,9 +183,6 @@ def __init__(
"Please pass in `min_length` into `.generate()` instead"
)

# We need to roll back the cache in assisted generation, only DynamicCache is supported
self.generation_config.cache_implementation = None

if (
is_sklearn_available()
and self.assistant_model.generation_config.assistant_confidence_threshold
@@ -212,11 +209,13 @@ def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor,
min_new_tokens, max_new_tokens = self._calculate_new_tokens(input_ids)
if max_new_tokens == 0:
return input_ids, None

# Update past key values and masks
self._update_past_and_masks(input_ids)
# Generate candidates
generation_args = self._prepare_generation_args(input_ids, min_new_tokens, max_new_tokens)
candidate_ids, candidate_logits = self._generate_candidates(generation_args)

return candidate_ids, candidate_logits

def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
@@ -312,6 +311,7 @@ def _generate_candidates(self, generation_args: Dict) -> Tuple[torch.LongTensor,
"""Generate candidate sequences using the assistant model."""
assistant_output = self.assistant_model.generate(**generation_args, **self.assistant_kwargs)
self.assistant_kwargs["past_key_values"] = assistant_output.past_key_values
self.generation_config.cache_implementation = None
if (
is_sklearn_available()
and self.assistant_model.generation_config.assistant_confidence_threshold
@@ -801,7 +801,7 @@ def _crop_past_key_values(model, past_key_values, max_length):
else:
for idx in range(len(past_key_values)):
past_key_values[idx] = past_key_values[idx][:, :, :max_length, :]
elif isinstance(past_key_values, DynamicCache):
elif isinstance(past_key_values, Cache):
past_key_values.crop(max_length)
elif past_key_values is not None:
for idx in range(len(past_key_values)):
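Widening the `isinstance` check from `DynamicCache` to the `Cache` base class is what lets the rollback reach any cache object that implements `crop()`, `StaticCache` included. A condensed sketch of the resulting dispatch (the real `_crop_past_key_values` also special-cases encoder-decoder models and a few legacy architectures):

```python
from transformers.cache_utils import Cache


def crop_past_key_values_sketch(past_key_values, max_length: int):
    """Condensed illustration of the widened dispatch; not the full library helper."""
    if isinstance(past_key_values, Cache):
        # DynamicCache slices its tensors; StaticCache (with the new crop() above)
        # zeroes the trailing slots instead. Either way the rollback happens in place.
        past_key_values.crop(max_length)
    elif past_key_values is not None:
        # Legacy tuple-of-tuples format: slice every layer along the sequence dimension.
        past_key_values = tuple(
            (k[:, :, :max_length, :], v[:, :, :max_length, :]) for k, v in past_key_values
        )
    return past_key_values
```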
26 changes: 13 additions & 13 deletions src/transformers/generation/utils.py
@@ -429,7 +429,7 @@ def prepare_inputs_for_generation(
model_input = model_input.clone(memory_format=torch.contiguous_format)
model_inputs[model_input_name] = model_input

# 6. Create 4D attention mask is we are using a `StaticCache` (important for performant compiled forward pass)
# 6. Create 4D attention mask if we are using a `StaticCache` (important for performant compiled forward pass)
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
@@ -1749,16 +1749,6 @@ def _prepare_cache_for_generation(
return

# Otherwise we NEED to prepare a cache, based on `generation_config.cache_implementation`

# TODO(joao): support static caches in assisted generation. assisted generation needs to roll back caches,
# which is only supported in dynamic caches atm
if assistant_model is not None and generation_config.cache_implementation is not None:
logger.warning_once(
"An assistant model is provided, using a dynamic cache instead of a cache of type="
f"'{generation_config.cache_implementation}'."
)
generation_config.cache_implementation = None

if generation_config.cache_implementation is not None:
if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
if generation_config.cache_implementation == "static" and not self._supports_static_cache:
@@ -1773,6 +1763,15 @@
device=device,
model_kwargs=model_kwargs,
)
if assistant_model is not None:
assistant_model._get_cache(
cache_implementation=generation_config.cache_implementation,
batch_size=max(generation_config.num_beams, generation_config.num_return_sequences)
* batch_size,
max_cache_len=max_cache_length,
device=device,
model_kwargs=model_kwargs,
)
elif generation_config.cache_implementation == "quantized":
if not self._supports_quantized_cache:
raise ValueError(
@@ -2119,6 +2118,7 @@ def generate(
and not self.config.is_encoder_decoder
):
max_cache_length += inputs_tensor.shape[1]

self._prepare_cache_for_generation(
generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device
)
@@ -2172,8 +2172,8 @@ def generate(
raise ValueError("assisted generate is only supported for batch_size = 1")
if not model_kwargs["use_cache"]:
raise ValueError("assisted generate requires `use_cache=True`")
if generation_config.cache_implementation in ["static", "hybrid", "sliding_window"]:
raise ValueError("assisted generate is not supported with Static cache classes`")
if generation_config.cache_implementation in ["hybrid", "sliding_window"]:
raise ValueError("assisted generate is not supported with hybrid & sliding_window cache classes")
if self._is_stateful:
# In assisted generation we need the ability to confirm whether the model would pick certain tokens,
# which is not possible with stateful models (they can't reset to a previous subset of generated text)
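With the forced downgrade removed from `_prepare_cache_for_generation` and the `generate()` guard narrowed to hybrid and sliding-window caches, assisted generation can be requested together with `cache_implementation="static"`, and the assistant model gets its own static cache of the same length. A hedged usage sketch (the checkpoints are placeholders; assisted generation still requires a batch size of 1):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoints: any target/assistant pair sharing a tokenizer should work.
target_id = "meta-llama/Llama-2-7b-hf"
assistant_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

tokenizer = AutoTokenizer.from_pretrained(target_id)
model = AutoModelForCausalLM.from_pretrained(target_id, torch_dtype=torch.float16, device_map="auto")
assistant = AutoModelForCausalLM.from_pretrained(assistant_id, torch_dtype=torch.float16, device_map="auto")

inputs = tokenizer("The theory of special relativity states", return_tensors="pt").to(model.device)

# Before this change, passing `assistant_model` silently replaced any requested cache
# implementation with a dynamic cache; with it, both models use a StaticCache.
outputs = model.generate(
    **inputs,
    assistant_model=assistant,
    cache_implementation="static",
    max_new_tokens=32,
    do_sample=False,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```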
87 changes: 86 additions & 1 deletion tests/generation/test_utils.py
@@ -2072,6 +2072,86 @@ def test_generate_compile(self, _, end_to_end):
for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs):
self._check_similar_generate_outputs(dynamic_result, compiled_result)

@parameterized.expand(
[
("forward_only", False), # TODO (@joao): a few models failing. After fixed, this should not be "@slow"
("end_to_end", True), # TODO (@joao): end-to-end compilation is broken with torch 2.5+, explore and fix
]
)
@pytest.mark.generate
@require_torch_gpu
@slow
def test_assisted_decoding_compile(self, _, end_to_end):
"""
Tests that `.generate` is compatible with torch.compile without graph breaks, keeping the same results. Tests
end-to-end compilation and forward pass compilation only.
⚠️ Runs two sequential generations to ensure the cache doesn't get stuck after the first compiled run! ⚠️
"""
for model_class in self.all_generative_model_classes:
if not model_class._supports_static_cache:
self.skipTest("This model doesn't support static cache")

if model_class._is_stateful:
self.skipTest(reason="Stateful models don't support assisted generation")

# TODO (joao) -- fix and enable me :)
if end_to_end and any(model_name in model_class.__name__.lower() for model_name in ["whisper"]):
self.skipTest("whisper model end-to-end generate compile not yet supported")

config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
# TODO (joao) -- fix and enable me :)
if end_to_end and config.is_encoder_decoder:
self.skipTest("Encoder-decoder model end-to-end generate compile not yet supported")

if not hasattr(config, "use_cache"):
self.skipTest(reason=f"{model_class.__name__} doesn't support caching")

model = model_class(config).to(torch_device)
model.eval() # otherwise `self.training` is `True` -- this flag is used at attn mask creation time

input_ids = inputs_dict["input_ids"].to(torch_device)
# assisted decoding only supports batch size 1, so divide and conquer
batch_size = input_ids.shape[0]
input_ids_sets = [torch.unsqueeze(input_ids[0, :], 0)]
for i in range(1, batch_size):
input_ids_sets.append(torch.unsqueeze(input_ids[i, :], 0))
self.assertTrue(input_ids_sets[i].shape == input_ids_sets[0].shape)

generation_kwargs = {
"do_sample": False,
"max_new_tokens": 10,
"return_dict_in_generate": True,
"output_scores": True,
"return_legacy_cache": False,
}

# end-to-end works best with dynamic cache, forward compilation works best with static cache
if not end_to_end:
generation_kwargs["cache_implementation"] = "static"

# get eager + dynamic cache results for future comparison
dynamic_outputs = []
for model_inputs in input_ids_sets:
dynamic_outputs.append(model.generate(model_inputs, assistant_model=model, **generation_kwargs))

# get compiled results
generation_config = copy.deepcopy(model.generation_config)
generation_config.update(**generation_kwargs)
torch.compiler.reset()
if end_to_end:
model.generate = torch.compile(model.generate, fullgraph=True, mode="reduce-overhead")
else:
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")

compiled_outputs = []
for model_inputs in input_ids_sets:
compiled_outputs.append(
model.generate(model_inputs, assistant_model=model, generation_config=generation_config)
)

for dynamic_result, compiled_result in zip(dynamic_outputs, compiled_outputs):
self._check_similar_generate_outputs(dynamic_result, compiled_result)

@pytest.mark.generate
def test_generate_methods_with_num_logits_to_keep(self):
for model_class in self.all_generative_model_classes:
@@ -2097,13 +2177,16 @@ def test_generate_methods_with_num_logits_to_keep(self):
without_all_logits = model.generate(**inputs_dict, **generation_kwargs)
self.assertEqual(with_all_logits.tolist(), without_all_logits.tolist())

@parameterized.expand([("static", False), (None, True)])
@pytest.mark.generate
def test_assisted_decoding_with_num_logits_to_keep(self):
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
for model_class in self.all_generative_model_classes:
if "num_logits_to_keep" not in set(inspect.signature(model_class.forward).parameters.keys()):
self.skipTest(reason="This model does not support `num_logits_to_keep` argument.")
if model_class._is_stateful:
self.skipTest(reason="Stateful models don't support assisted generation")
if cache_implementation == "static" and not model_class._supports_static_cache:
self.skipTest(reason="This model does not support `cache_implementation=static`.")

config, inputs_dict = self.prepare_config_and_inputs_for_generate(batch_size=1)
# NOTE: assisted generation only works with cache on at the moment.
@@ -2123,6 +2206,8 @@
"assistant_model": assistant_model,
"return_dict_in_generate": True,
"output_scores": True,
"cache_implementation": cache_implementation,
"return_legacy_cache": return_legacy_cache,
}

# Setting num_logits_to_keep at 0 keeps all logits (old behavior)
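For context on the new parameterization: `@parameterized.expand([("static", False), (None, True)])` fans the single test method out into two cases, passing each tuple's elements as positional arguments (`cache_implementation`, `return_legacy_cache`). A tiny self-contained illustration, assuming the standard `parameterized` package already used by this test suite:

```python
import unittest

from parameterized import parameterized


class Demo(unittest.TestCase):
    @parameterized.expand([("static", False), (None, True)])
    def test_cases(self, cache_implementation, return_legacy_cache):
        # Runs twice: once with ("static", False) and once with (None, True),
        # mirroring the static-cache and legacy dynamic-cache configurations above.
        self.assertIn(cache_implementation, ("static", None))
        self.assertIsInstance(return_legacy_cache, bool)


if __name__ == "__main__":
    unittest.main()
```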
5 changes: 5 additions & 0 deletions tests/models/gemma2/test_modeling_gemma2.py
@@ -113,6 +113,11 @@ def test_prompt_lookup_decoding_matches_greedy_search(self, assistant_type):
def test_assisted_decoding_sample(self):
pass

@parameterized.expand([("static", False)])
@unittest.skip("Gemma2 has HybridCache which is not compatible with assisted decoding StaticCache")
def test_assisted_decoding_with_num_logits_to_keep(self, cache_implementation, return_legacy_cache):
pass

@unittest.skip("Gemma2 has HybridCache which is not compatible with dola decoding")
def test_dola_decoding_sample(self):
pass