enable StaticCache for assisted generation #34797

Open

wants to merge 51 commits into base: main

Commits (51)
c205b2e
enable StaticCache for assisted generation
yao-matrix Nov 19, 2024
30021dd
update
yao-matrix Nov 19, 2024
620c861
remove warnings import
yao-matrix Nov 19, 2024
b5283e9
enable StaticCache for assisted generation
yao-matrix Nov 19, 2024
71b7d22
update
yao-matrix Nov 19, 2024
c967bbe
remove warnings import
yao-matrix Nov 19, 2024
980aa08
done
yao-matrix Nov 20, 2024
c79411d
done
yao-matrix Nov 20, 2024
c717652
fix review comments
yao-matrix Nov 22, 2024
67618e5
Merge branch 'main' of https://github.com/yao-matrix/transformers
yao-matrix Nov 22, 2024
c8e2428
Merge branch 'main' into main
yao-matrix Nov 26, 2024
fde7ebd
Merge branch 'main' into main
yao-matrix Nov 26, 2024
e1169a3
Merge branch 'main' into main
yao-matrix Nov 29, 2024
8a9a753
add static cache ci
Nov 29, 2024
b74a7fe
Merge branch 'main' of https://github.com/yao-matrix/transformers
Nov 29, 2024
e67a3fd
skip Gemma2 StaticCache CI since it uses HybridCache
Nov 29, 2024
177634c
ruff format
Nov 29, 2024
45d0410
fix phimoe ci
yao-matrix Nov 29, 2024
a33f660
fix mixtral ci
yao-matrix Nov 29, 2024
ff07e47
fix ci
yao-matrix Nov 29, 2024
fbef806
cont.
yao-matrix Nov 29, 2024
3564a87
ci
yao-matrix Nov 29, 2024
b9cf597
fix ci
yao-matrix Dec 2, 2024
803166d
ci
yao-matrix Dec 2, 2024
df1594c
ci
yao-matrix Dec 2, 2024
87b7f15
ci
yao-matrix Dec 2, 2024
e60b1fe
Merge branch 'main' into main
yao-matrix Dec 2, 2024
5e195c2
ci
yao-matrix Dec 2, 2024
759da36
ci
yao-matrix Dec 2, 2024
093b647
ci
yao-matrix Dec 2, 2024
3775dc2
ci
yao-matrix Dec 2, 2024
817d303
ci
yao-matrix Dec 2, 2024
6e2ad2a
add # Ignore copy
yao-matrix Dec 2, 2024
99b6bc2
using a smarter way, ignore in test_utils
yao-matrix Dec 2, 2024
af33391
ci
yao-matrix Dec 2, 2024
587b55f
skip Gemma2, it declares support static cache, but it's hybrid cache a…
yao-matrix Dec 2, 2024
0a49d6f
refine error message
yao-matrix Dec 2, 2024
1deeb55
Merge branch 'main' into main
yao-matrix Dec 4, 2024
62b70e4
Merge branch 'main' into main
yao-matrix Dec 5, 2024
210c2e0
Merge branch 'main' into main
yao-matrix Dec 10, 2024
7b97aa4
add test case test_assisted_decoding_compile
yao-matrix Dec 11, 2024
9cb45da
Merge branch 'main' into main
yao-matrix Dec 11, 2024
93cd7bf
fix bug
yao-matrix Dec 11, 2024
3cc23d7
Merge branch 'main' into main
yao-matrix Dec 13, 2024
b45336c
Merge branch 'main' into main
yao-matrix Dec 15, 2024
04f2ea1
Merge branch 'main' into main
yao-matrix Dec 18, 2024
b08d1fc
Merge branch 'main' into main
yao-matrix Dec 19, 2024
0904268
Merge branch 'main' into main
yao-matrix Dec 20, 2024
dd148a8
Merge branch 'main' into main
yao-matrix Dec 22, 2024
3171476
cohere2 is HybridCache
Dec 23, 2024
4e064ec
Merge branch 'main' into main
yao-matrix Jan 2, 2025
19 changes: 19 additions & 0 deletions src/transformers/cache_utils.py
@@ -1226,6 +1226,25 @@ def update(

        return k_out, v_out

    def crop(self, max_length: int):
        """Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
        negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search."""
        # In case it is negative
        if max_length < 0:
            max_length = self.get_seq_length() - abs(max_length)

        if self.get_seq_length() <= max_length:
            return

        begin = max_length
        end = self.get_seq_length() + 1
        index = torch.arange(begin, end, device=self.key_cache[0].device)

        self._seen_tokens = max_length
        for idx in range(len(self.key_cache)):
            self.key_cache[idx].index_fill_(2, index, 0)
            self.value_cache[idx].index_fill_(2, index, 0)

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states that were seen by the model."""
        # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's
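As a reading aid, here is a minimal, self-contained sketch of the crop logic above, using a plain tensor as a stand-in for one layer of the pre-allocated cache (shapes and numbers are illustrative only, not the real StaticCache class):

```python
import torch

# Stand-in for one layer of a StaticCache buffer: (batch, heads, max_cache_len, head_dim)
max_cache_len = 8
key_cache = torch.randn(1, 2, max_cache_len, 4)
seen_tokens = 6                # slots written so far

max_length = -2                # negative means "drop the last 2 tokens"
if max_length < 0:
    max_length = seen_tokens - abs(max_length)   # -> 4

if seen_tokens > max_length:
    # Zero-fill the rolled-back slots. Since occupancy is detected via non-zero
    # values along dim 2, the cropped positions stop counting as seen tokens.
    index = torch.arange(max_length, seen_tokens, device=key_cache.device)
    key_cache.index_fill_(2, index, 0)
    seen_tokens = max_length

print(seen_tokens)  # 4
```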
8 changes: 5 additions & 3 deletions src/transformers/generation/candidate_generator.py
@@ -19,7 +19,7 @@
import numpy as np
import torch

from ..cache_utils import DynamicCache
from ..cache_utils import DynamicCache, StaticCache
from ..pytorch_utils import isin_mps_friendly
from .logits_process import LogitsProcessorList, MinLengthLogitsProcessor

@@ -176,10 +176,10 @@ def __init__(
                    "Passing `MinLengthLogitsProcessor` when using `assisted_generation is disabled. "
                    "Please pass in `min_length` into `.generate()` instead"
                )

        # We need to roll back the cache in assisted generation, only DynamicCache is supported
        # assume cache created while _prepare_cache_for_generation is called
        self.generation_config.cache_implementation = None


    def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]:
        """
        Fetches the candidates to be tried for the current input.
@@ -696,6 +696,8 @@ def _crop_past_key_values(model, past_key_values, max_length):
                past_key_values[idx] = past_key_values[idx][:, :, :max_length, :]
    elif isinstance(past_key_values, DynamicCache):
        past_key_values.crop(max_length)
    elif isinstance(past_key_values, StaticCache):
        past_key_values.crop(max_length)
    elif past_key_values is not None:
        for idx in range(len(past_key_values)):
            if past_key_values[idx] != ([], []):
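The DynamicCache branch can be exercised in isolation with toy tensors; the new StaticCache branch is intended to behave the same way from the caller's perspective, except that it zero-fills its pre-allocated buffers instead of truncating them. A small sketch:

```python
import torch
from transformers.cache_utils import DynamicCache

cache = DynamicCache()
# Write 6 toy tokens into layer 0: shapes are (batch, num_heads, seq_len, head_dim).
cache.update(torch.randn(1, 2, 6, 4), torch.randn(1, 2, 6, 4), layer_idx=0)
print(cache.get_seq_length())  # 6

# Roll back to the 4-token prefix, as _crop_past_key_values does after a
# speculation round in which only some candidate tokens were accepted.
cache.crop(4)
print(cache.get_seq_length())  # 4
```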
23 changes: 11 additions & 12 deletions src/transformers/generation/utils.py
@@ -423,7 +423,7 @@ def prepare_inputs_for_generation(
                    model_input = model_input.clone(memory_format=torch.contiguous_format)
                model_inputs[model_input_name] = model_input

        # 6. Create 4D attention mask is we are using a `StaticCache` (important for performant compiled forward pass)
        # 6. Create 4D attention mask if we are using a `StaticCache` (important for performant compiled forward pass)
        if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
            if model_inputs["inputs_embeds"] is not None:
                batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
@@ -1727,16 +1727,6 @@ def _prepare_cache_for_generation(
            return

        # Otherwise we NEED to prepare a cache, based on `generation_config.cache_implementation`

        # TODO(joao): support static caches in assisted generation. assisted generation needs to roll back caches,
        # which is only supported in dynamic caches atm
        if assistant_model is not None and generation_config.cache_implementation is not None:
            logger.warning_once(
                "An assistant model is provided, using a dynamic cache instead of a cache of type="
                f"'{generation_config.cache_implementation}'."
            )
            generation_config.cache_implementation = None

        if generation_config.cache_implementation is not None:
            if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING:
                if generation_config.cache_implementation == "static" and not self._supports_static_cache:
@@ -1751,6 +1741,14 @@
                    device=device,
                    model_kwargs=model_kwargs,
                )
                if assistant_model is not None:
                    assistant_model._get_cache(
                        cache_implementation=generation_config.cache_implementation,
                        batch_size=max(generation_config.num_beams, generation_config.num_return_sequences) * batch_size,
                        max_cache_len=max_cache_length,
                        device=device,
                        model_kwargs=model_kwargs,
                    )
Member:

hmm, I think it will be called on the assistant model when we call assistant.generate(), so there is no need. We only need to remove `self.generation_config.cache_implementation = None` in the candidate generator.

Author:

The thing is: if we leave it to assistant_model.generate() (called from get_candidates) to create the cache, then on the first call max_new_tokens is set to max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1), so the cache length ends up as int(self.num_assistant_tokens) + prompt_len. That is smaller than the cache length actually needed, max_token_length + prompt_length, and generation asserts out later. So the key point is that the assistant model's cache length has to match the main model's, which is why the cache is created here. I also noticed that this function takes assistant_model as an argument but never uses it; I suspect it was added for cases exactly like this. That's the rationale.

Member:

Oh, I see, that makes sense. Then we can leave the cache init here.

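To make the sizing argument concrete with made-up numbers (all values hypothetical, purely to illustrate why the assistant's cache must be allocated with the main model's max_cache_length rather than sized by its own first generate() call):

```python
# Hypothetical numbers, for illustration only.
prompt_len = 100
max_new_tokens = 200          # the main model's overall generation budget
num_assistant_tokens = 5      # candidate tokens proposed per speculation round

# If the assistant's StaticCache were created lazily inside its first generate()
# call, it would be sized for a single speculation round:
lazily_sized_cache = prompt_len + num_assistant_tokens        # 105 slots

# But the assistant's prefix grows with every accepted token, so over the whole
# run it needs as many slots as the main model:
required_cache = prompt_len + max_new_tokens                  # 300 slots

assert lazily_sized_cache < required_cache  # later rounds would overflow the cache
```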
            elif generation_config.cache_implementation == "quantized":
                if not self._supports_quantized_cache:
                    raise ValueError(
@@ -2097,6 +2095,7 @@ def generate(
            and not self.config.is_encoder_decoder
        ):
            max_cache_length += inputs_tensor.shape[1]

        self._prepare_cache_for_generation(
            generation_config, model_kwargs, assistant_model, batch_size, max_cache_length, device
        )
@@ -2150,7 +2149,7 @@
                raise ValueError("assisted generate is only supported for batch_size = 1")
            if not model_kwargs["use_cache"]:
                raise ValueError("assisted generate requires `use_cache=True`")
            if generation_config.cache_implementation in ["static", "hybrid", "sliding_window"]:
            if generation_config.cache_implementation in ["hybrid", "sliding_window"]:
                raise ValueError("assisted generate is not supported with Static cache classes`")
            if self._is_stateful:
                # In assisted generation we need the ability to confirm whether the model would pick certain tokens,
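For context, once assisted generation accepts a static cache, usage would presumably look like the sketch below. The checkpoints are placeholders (any target/assistant pair sharing a tokenizer and supporting StaticCache should work in principle), and the exact user-facing behavior depends on the final form of this PR:

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoints: a large target model and a small assistant sharing a tokenizer.
tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-7B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
assistant = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")

inputs = tok("Speculative decoding works by", return_tensors="pt").to(model.device)
out = model.generate(
    **inputs,
    assistant_model=assistant,
    cache_implementation="static",  # previously forced back to a dynamic cache when an assistant was passed
    max_new_tokens=64,
)
print(tok.decode(out[0], skip_special_tokens=True))
```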