huggingface · ArthurZucker · Feb 8, 2024 · Dec 10, 2023 · Dec 10, 2023 · Dec 10, 2023
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
@@ -1336,7 +1336,7 @@
     _import_structure["activations"] = []
     _import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"]
     _import_structure["benchmark.benchmark_args"] = ["PyTorchBenchmarkArguments"]
-    _import_structure["cache_utils"] = ["Cache", "DynamicCache", "SinkCache"]
+    _import_structure["cache_utils"] = ["Cache", "DynamicCache", "SinkCache", "StaticCache"]
     _import_structure["data.datasets"] = [
         "GlueDataset",
         "GlueDataTrainingArguments",
@@ -6070,7 +6070,7 @@
         # Benchmarks
         from .benchmark.benchmark import PyTorchBenchmark
         from .benchmark.benchmark_args import PyTorchBenchmarkArguments
-        from .cache_utils import Cache, DynamicCache, SinkCache
+        from .cache_utils import Cache, DynamicCache, SinkCache, StaticCache
         from .data.datasets import (
             GlueDataset,
             GlueDataTrainingArguments,

diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
@@ -1,8 +1,12 @@
+from dataclasses import dataclass
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
 
+from .configuration_utils import PretrainedConfig
 
+
+@dataclass
 class Cache:
     """
     Base, abstract class for all caches. The actual data structure is specific to each subclass.
@@ -320,3 +324,105 @@ def reorder_cache(self, beam_idx: torch.LongTensor):
             self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
             device = self.value_cache[layer_idx].device
             self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+
+class StaticCache(Cache):
+    """
+    Key/value cache backed by two tensors that are allocated once, at construction time.
+
+    `key_cache` and `value_cache` are zero-initialised with shape
+    `(max_batch_size, num_attention_heads, max_cache_len, head_dim)` and `update` writes new states into them in
+    place, at the positions given by `cache_kwargs["position_ids"]`.
+
+    Parameters:
+        config (`PretrainedConfig`):
+            Model configuration. `hidden_size`, `num_attention_heads`, `max_position_embeddings` and `torch_dtype`
+            are read from it.
+        max_batch_size (`int`):
+            Size of the first (batch) dimension of the pre-allocated tensors.
+        max_cache_len (`int`, *optional*):
+            Size of the third (sequence) dimension of the pre-allocated tensors. When `None`,
+            `config.max_position_embeddings` is used.
+        device (`torch.device` or `str`):
+            Device on which the tensors are allocated.
+    """
+
+    def __init__(self, config: PretrainedConfig, max_batch_size, max_cache_len, device) -> None:
+        super().__init__()
+        self.max_batch_size = max_batch_size
+        self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        self.num_heads = config.num_attention_heads
+        # Fall back to float32 when the config does not specify a dtype.
+        self.dtype = config.torch_dtype if config.torch_dtype is not None else torch.float32
+
+        # NOTE(review): the head dimension is sized with `num_attention_heads`; if a model passes key/value states
+        # with a different number of heads to `update`, this shape needs revisiting -- confirm against callers.
+        cache_shape = (max_batch_size, self.num_heads, self.max_cache_len, self.head_dim)
+        self.key_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+        self.value_cache: torch.Tensor = torch.zeros(cache_shape, dtype=self.dtype, device=device)
+
+        # Running count of tokens passed to `update`; reported by `get_seq_length`.
+        self.seen_tokens = 0
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Writes the new `key_states` and `value_states` into the pre-allocated cache tensors.
+        It is VERY important to index using a tensor, otherwise you introduce a copy to the device.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer the states belong to. Not used by this implementation.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. The `StaticCache` requires a `position_ids` entry:
+                the indices along the sequence dimension at which the new states are written.
+
+        Return:
+            A tuple containing the full key and value cache tensors (not only the newly written positions).
+
+        Raises:
+            ValueError: If `cache_kwargs` is `None` or does not contain `position_ids`.
+        """
+        position_ids = None if cache_kwargs is None else cache_kwargs.get("position_ids")
+        if position_ids is None:
+            # Indexing with `None` would insert a new axis instead of selecting positions, so fail loudly.
+            raise ValueError("`StaticCache.update` requires `cache_kwargs` to contain `position_ids`.")
+
+        k_out = self.key_cache
+        v_out = self.value_cache
+
+        k_out[:, :, position_ids] = key_states
+        v_out[:, :, position_ids] = value_states
+
+        self.seen_tokens += key_states.shape[-2]
+        return k_out, v_out
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the number of tokens seen so far by the cache. `layer_idx` is accepted but ignored."""
+        return self.seen_tokens
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cache, i.e. the length its tensors were allocated with."""
+        return self.max_cache_len
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        # `index_select` returns new tensors, so both attributes are rebound rather than updated in place.
+        device = self.key_cache.device
+        self.key_cache = self.key_cache.index_select(0, beam_idx.to(device))
+        device = self.value_cache.device
+        self.value_cache = self.value_cache.index_select(0, beam_idx.to(device))
+
+    def to_legacy_cache(self):
+        """Dummy method kept for backward compatibility; it always returns `None` and should not be used."""
+        return None
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
@@ -250,6 +250,11 @@ class GenerationConfig(PushToHubMixin):
               reduce by 1
             - `"constant"`: `num_assistant_tokens` stays unchanged during generation
 
+        > Parameters specific to the caching mechanism:
+
+        cache_implementation (`str`, *optional*, defaults to `"dynamic"`):
+            Name of the cache class that should be used when generating.
+
         > Wild card
 
         generation_kwargs:
@@ -321,6 +326,9 @@ def __init__(self, **kwargs):
         self.num_assistant_tokens = kwargs.pop("num_assistant_tokens", 5)
         self.num_assistant_tokens_schedule = kwargs.pop("num_assistant_tokens_schedule", "heuristic")
 
+        # Cache implementation
+        self.cache_implementation = kwargs.pop("cache_implementation", "dynamic")
+
         # Prompt lookup decoding
         self.prompt_lookup_num_tokens = kwargs.pop("prompt_lookup_num_tokens", None)
 

diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
@@ -24,7 +24,7 @@
 import torch.distributed as dist
 from torch import nn
 
-from ..cache_utils import Cache, DynamicCache
+from ..cache_utils import Cache, DynamicCache, StaticCache
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
 from ..models.auto import (
@@ -92,6 +92,10 @@
 if is_accelerate_available():
     from accelerate.hooks import AlignDevicesHook, add_hook_to_module
 
+ALL_COMPILE_CACHE_CLASSES_MAPPING = {
+    "static": StaticCache,
+}
+
 
 @dataclass
 class GenerateDecoderOnlyOutput(ModelOutput):
@@ -1399,6 +1403,19 @@ def generate(
                     "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
+
+        # if we don't pass `past_key_values` and a cache_implementation is specified
+        if generation_config.cache_implementation in ALL_COMPILE_CACHE_CLASSES_MAPPING and not model_kwargs.get(
+            "past_key_values", False
+        ):
+            cache_cls = ALL_COMPILE_CACHE_CLASSES_MAPPING[generation_config.cache_implementation]
+            if not callable(getattr(self, "_setup_cache", None)):
+                raise ValueError(
+                    "The `generation_config` defines a `cache_implementation` that is not compatible with this model."
+                    " Make sure it has a `_setup_cache` function."
+                )
+            self._setup_cache(cache_cls, max_batch_size=batch_size, max_cache_len=generation_config.max_length)
+
         self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
 
         # 7. determine generation mode