diff --git a/.circleci/config.yml b/.circleci/config.yml
index cdd97f4fcecaff..6558dc1454b273 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -142,6 +142,7 @@ jobs:
- run: python utils/custom_init_isort.py --check_only
- run: python utils/sort_auto_mappings.py --check_only
- run: python utils/check_doc_toc.py
+ - run: python utils/check_docstrings.py --check_all
check_repository_consistency:
working_directory: ~/transformers
@@ -190,4 +191,4 @@ workflows:
- check_circleci_user
- check_code_quality
- check_repository_consistency
- - fetch_all_tests
\ No newline at end of file
+ - fetch_all_tests
diff --git a/.github/workflows/self-pr-slow-ci.yml b/.github/workflows/self-pr-slow-ci.yml
index 8225e5b6aa7b1d..2287b5e3f31587 100644
--- a/.github/workflows/self-pr-slow-ci.yml
+++ b/.github/workflows/self-pr-slow-ci.yml
@@ -4,7 +4,7 @@ on:
pull_request:
paths:
- "src/transformers/models/*/modeling_*.py"
- - "tests/models/*/test_*.py"
+ - "tests/**/test_*.py"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
diff --git a/.github/workflows/trufflehog.yml b/.github/workflows/trufflehog.yml
index 7dde5462240424..29a11e9354dbb1 100644
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@@ -10,20 +10,9 @@ jobs:
trufflehog:
runs-on: ubuntu-latest
steps:
- - shell: bash
- run: |
- if [ "${{ github.event_name }}" == "push" ]; then
- echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV
- echo "branch=${{ github.ref_name }}" >> $GITHUB_ENV
- fi
- if [ "${{ github.event_name }}" == "pull_request" ]; then
- echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV
- echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV
- fi
- - name: Checkout code
- uses: actions/checkout@v4
- with:
- ref: ${{env.branch}}
- fetch-depth: ${{env.depth}}
- - name: Secret Scanning
- uses: trufflesecurity/trufflehog@main
+ - name: Checkout code
+ uses: actions/checkout@v4
+ with:
+ fetch-depth: 0
+ - name: Secret Scanning
+ uses: trufflesecurity/trufflehog@main
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index f96bcd9e9d2875..4d62a44ab250d5 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -132,7 +132,7 @@ You will need basic `git` proficiency to contribute to
manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.
-You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
+You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
1. Fork the [repository](https://github.com/huggingface/transformers) by
clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code
@@ -341,12 +341,12 @@ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_ne
RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
```
-Like the slow tests, there are other environment variables available which not enabled by default during testing:
+Like the slow tests, there are other environment variables available which are not enabled by default during testing:
- `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
- `RUN_PT_FLAX_CROSS_TESTS`: Enables tests for PyTorch + Flax integration.
- `RUN_PT_TF_CROSS_TESTS`: Enables tests for TensorFlow + PyTorch integration.
-More environment variables and additional information can be found in the [testing_utils.py](src/transformers/testing_utils.py).
+More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
`pytest`-specific features in the test suite itself.
diff --git a/Makefile b/Makefile
index f9b2a8c9a7c620..cfa40b7bd6ee6e 100644
--- a/Makefile
+++ b/Makefile
@@ -56,6 +56,7 @@ quality:
python utils/custom_init_isort.py --check_only
python utils/sort_auto_mappings.py --check_only
python utils/check_doc_toc.py
+ python utils/check_docstrings.py --check_all
# Format source code automatically and check is there are any problems left that need manual fixing
diff --git a/docker/consistency.dockerfile b/docker/consistency.dockerfile
index c59e48bdd89d51..73436f8afca29f 100644
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@@ -8,7 +8,7 @@ RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,vision,testing]"
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,speech,vision,testing]"
RUN git lfs install
RUN pip uninstall -y transformers
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index 378a65d1bf37b8..9c5e3c91415745 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
-ARG PYTORCH='2.3.0'
+ARG PYTORCH='2.4.0'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
index c9f77a78ce9b83..2c1f153eef275e 100644
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -11,7 +11,7 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# If set to nothing, will install the latest version
-ARG PYTORCH='2.3.0'
+ARG PYTORCH='2.4.0'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index cc6ff752c7701e..93f2c96d2d9df0 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -92,11 +92,15 @@
title: Visual Question Answering
- local: tasks/text-to-speech
title: Text to speech
+ - local: tasks/image_text_to_text
+ title: Image-text-to-text
title: Multimodal
- isExpanded: false
sections:
- local: generation_strategies
title: Customize the generation strategy
+ - local: kv_cache
+ title: Best Practices for Generation with Cache
title: Generation
- isExpanded: false
sections:
@@ -155,6 +159,8 @@
title: EETQ
- local: quantization/hqq
title: HQQ
+ - local: quantization/fbgemm_fp8
+ title: FBGEMM_FP8
- local: quantization/optimum
title: Optimum
- local: quantization/contribute
@@ -326,8 +332,6 @@
title: CamemBERT
- local: model_doc/canine
title: CANINE
- - local: model_doc/chameleon
- title: chameleon
- local: model_doc/codegen
title: CodeGen
- local: model_doc/code_llama
@@ -760,6 +764,8 @@
title: BridgeTower
- local: model_doc/bros
title: BROS
+ - local: model_doc/chameleon
+ title: Chameleon
- local: model_doc/chinese_clip
title: Chinese-CLIP
- local: model_doc/clip
diff --git a/docs/source/en/agents.md b/docs/source/en/agents.md
index d1c550f5d32ea8..f335cb678faa60 100644
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@@ -509,3 +509,54 @@ agent = ReactCodeAgent(tools=[search_tool])
agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
```
+
+## Gradio interface
+
+You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`. Here is an example:
+
+```py
+import gradio as gr
+from transformers import (
+ load_tool,
+ ReactCodeAgent,
+ HfEngine,
+ stream_to_gradio,
+)
+
+# Import tool from Hub
+image_generation_tool = load_tool("m-ric/text-to-image")
+
+llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
+
+# Initialize the agent with the image generation tool
+agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
+
+
+def interact_with_agent(task):
+ messages = []
+ messages.append(gr.ChatMessage(role="user", content=task))
+ yield messages
+ for msg in stream_to_gradio(agent, task):
+ messages.append(msg)
+ yield messages + [
+ gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
+ ]
+ yield messages
+
+
+with gr.Blocks() as demo:
+ text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
+ submit = gr.Button("Run illustrator agent!")
+ chatbot = gr.Chatbot(
+ label="Agent",
+ type="messages",
+ avatar_images=(
+ None,
+ "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
+ ),
+ )
+ submit.click(interact_with_agent, [text_input], [chatbot])
+
+if __name__ == "__main__":
+ demo.launch()
+```
\ No newline at end of file
diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md
index d840caaf660520..c4069dd1afc706 100644
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
->>> tokenizer.default_chat_template
+>>> tokenizer.chat_template
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
```
@@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template`
We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend
trying to put it all in a single template where possible!
-### What are "default" templates?
-
-Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards
-compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a
-model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline`
-class and methods like `apply_chat_template` will use the class template instead. You can find out what the default
-template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute.
-
-This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when
-the class template is appropriate for your model, we strongly recommend overriding the default template by
-setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured
-for chat.
-
-Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be
-removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that
-still depend on them!
-
### What template should I use?
When setting the template for a model that's already been trained for chat, you should ensure that the template
diff --git a/docs/source/en/conversations.md b/docs/source/en/conversations.md
index 9336503ad7cb8c..a48c046b4949d7 100644
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
print("Tokenized inputs:\n", inputs)
# 4: Generate text from the model
-outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.)
+outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
print("Generated tokens:\n", outputs)
# 5: Decode the output back to a string
diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md
index 68430de643f17b..3a9392ddd07d9b 100644
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@@ -174,43 +174,6 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te
```
-## KV Cache Quantization
-
-The `generate()` method supports caching keys and values to enhance efficiency and avoid re-computations. However the key and value
-cache can occupy a large portion of memory, becoming a bottleneck for long-context generation, especially for Large Language Models.
-Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed.
-
-KV Cache quantization in `transformers` is largely inspired by the paper [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache]
-(https://arxiv.org/abs/2402.02750) and currently supports `quanto` and `HQQ` as backends. For more information on the inner workings see the paper.
-
-To enable quantization of the key-value cache, one needs to indicate `cache_implementation="quantized"` in the `generation_config`.
-Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`QuantizedCacheConfig`] class.
-One has to indicate which quantization backend to use in the [`QuantizedCacheConfig`], the default is `quanto`.
-
-
-
-Cache quantization can be detrimental if the context length is short and there is enough GPU VRAM available to run without cache quantization.
-
-
-
-
-```python
->>> import torch
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
-
->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
->>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
-
->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
-I like rock music because it's loud and energetic. It's a great way to express myself and rel
-
->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
-I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
-```
-
## Watermarking
The `generate()` supports watermarking the generated text by randomly marking a portion of tokens as "green".
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md
index da7ea25e54b6b0..1172e32fd0cc5a 100644
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@@ -386,11 +386,24 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- get_seq_length
- reorder_cache
+[[autodoc]] OffloadedCache
+ - update
+ - prefetch_layer
+ - evict_previous_layer
+
[[autodoc]] StaticCache
- update
- get_seq_length
- reset
+[[autodoc]] HybridCache
+ - update
+ - reset
+
+[[autodoc]] SlidingWindowCache
+ - update
+ - reset
+
[[autodoc]] EncoderDecoderCache
- get_seq_length
- to_legacy_cache
@@ -398,6 +411,11 @@ A [`Constraint`] can be used to force the generation to include specific tokens
- reset
- reorder_cache
+[[autodoc]] MambaCache
+ - update_conv_state
+ - update_ssm_state
+ - reset
+
## Watermark Utils
[[autodoc]] WatermarkDetector
diff --git a/docs/source/en/kv_cache.md b/docs/source/en/kv_cache.md
new file mode 100644
index 00000000000000..c0ccc49d41e683
--- /dev/null
+++ b/docs/source/en/kv_cache.md
@@ -0,0 +1,346 @@
+
+
+# Best Practices for Generation with Cache
+
+Efficient caching is crucial for optimizing the performance of models in various generative tasks,
+including text generation, translation, summarization and other transformer-based applications.
+Effective caching helps reduce computation time and improve response rates, especially in real-time or resource-intensive applications.
+
+Transformers supports various caching methods, leveraging "Cache" classes to abstract and manage the caching logic.
+This document outlines best practices for using these classes to maximize performance and efficiency.
+Check out all the available `Cache` classes in the [API documentation](./internal/generation_utils.md).
+
+## What is a Cache and why should we care?
+
+Imagine you’re having a conversation with someone, and instead of remembering what was said previously, you have to start from scratch every time you respond. This would be slow and inefficient, right? In the world of Transformer models, a similar concept applies, and that's where caching keys and values comes into play. From now on, we'll refer to this concept as the KV Cache.
+
+The KV cache is needed to optimize generation in autoregressive models, where the model predicts text token by token. This process can be slow since the model can generate only one token at a time, and each new prediction depends on the previous context. That means, to predict token number 1000 in the generation, you need information from the previous 999 tokens, which comes in the form of matrix multiplications across the representations of those tokens. But to predict token number 1001, you also need the same information from the first 999 tokens, plus additional information from token number 1000. That is where the key-value cache comes in: it optimizes the sequential generation process by storing previous calculations so they can be reused for subsequent tokens instead of being recomputed.
+
+More concretely, key-value cache acts as a memory bank for these generative models, where the model stores key-value pairs derived from self-attention layers for previously processed tokens. By storing this information, the model can avoid redundant computations and instead retrieve keys and values of previous tokens from the cache.
+
+
+ For the Curious Minds Who Like to Dive Deep
+
+ ### Under the Hood: How the Cache Object Works in the Attention Mechanism
+
+ When utilizing a cache object in the input, the Attention module performs several critical steps to integrate past and present information seamlessly.
+
+ The Attention module concatenates the current key-values with the past key-values stored in the cache. The concatenated key-values are used to compute the attention scores, resulting in attention weights of shape `(new_tokens_length, past_kv_length + new_tokens_length)`. Essentially, the past and current key-values are combined to compute attention scores, ensuring that the model considers both previous context and new input.
+
+ Therefore, when iteratively calling `forward()` instead of the `generate()` method, it’s crucial to ensure that the attention mask shape matches the combined length of past and current key-values. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is usually handled internally when you call the `generate()` method. If you want to implement your own generation loop with Cache classes, take this into consideration and prepare the attention mask so that it covers both the past and the current tokens.
+
+
+
+ One important concept you need to know when writing your own generation loop is `cache_position`. If you want to reuse an already filled Cache object by calling `forward()`, you have to pass in a valid `cache_position`, which indicates the positions of the inputs in the sequence. Note that `cache_position` is not affected by padding and always adds one more position for each token. For example, if the key/value cache contains 10 tokens (no matter how many of them are pad tokens), the cache position for the next token should be `torch.tensor([10])`.
+
+
+
+
+ See an example below for how to implement your own generation loop.
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+ >>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+ >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
+ >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+ >>> past_key_values = DynamicCache()
+ >>> messages = [{"role": "user", "content": "Hello, what's your name."}]
+ >>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
+
+ >>> generated_ids = inputs.input_ids
+ >>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
+ >>> max_new_tokens = 10
+
+ >>> for _ in range(max_new_tokens):
+ ... outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)
+ ... # Greedily sample one next token
+ ... next_token_ids = outputs.logits[:, -1:].argmax(-1)
+ ... generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
+ ...
+ ... # Prepare inputs for the next generation step by keeping only the unprocessed tokens, in our case the single new token,
+ ... # and expanding the attention mask for the new token, as explained above
+ ... attention_mask = inputs["attention_mask"]
+ ... attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+ ... inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask}
+ ... cache_position = cache_position[-1:] + 1 # add one more position for the next token
+
+ >>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
+ "[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
+ ```
+
+
+
+
+
+## Generate with Cache
+
+In 🤗 Transformers, we support various Cache types to optimize the performance across different models and tasks. By default, all models generate with caching,
+with the [`~DynamicCache`] class being the default cache for most models. It allows us to dynamically grow cache size, by saving more and more keys and values as we generate. If for some reason you don't want to use caches, you can pass `use_cache=False` into the `generate()` method.
+
+Refer to the table below to see the difference between cache types and choose the one that best suits your use case.
+
+| Cache Type | Memory Efficient | Supports torch.compile() | Initialization Recommended | Latency | Long Context Generation |
+|---------------------|------------------|--------------------------|----------------------------|----------|--------------------------|
+| Dynamic Cache | No | No | No | Mid | No |
+| Static Cache | No | Yes | Yes | High | No |
+| Quantized Cache | Yes | No | No | Low | Yes |
+| Offloaded Cache | Yes | No | No | Low | No |
+| Sliding Window Cache| No | Yes | Yes | High | No |
+| Sink Cache | Yes | No | Yes | Mid | Yes |
+
+
+These cache classes can be set with a `cache_implementation` argument when generating. To learn about the available options for the `cache_implementation` flag, please refer to the [API Documentation](./main_classes/text_generation.md#transformers.GenerationConfig). Now, let's explore each cache type in detail and see how to use them. Note that the examples below are for decoder-only Transformer-based models. We also support [model-specific cache classes](#model-specific-cache-classes) for models such as Mamba or Jamba; keep reading for more details.
+
+### Quantized Cache
+
+The key and value cache can occupy a large portion of memory, becoming a [bottleneck for long-context generation](https://huggingface.co/blog/llama31#inference-memory-requirements), especially for Large Language Models.
+Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed.
+
+KV Cache quantization in `transformers` is largely inspired by the paper ["KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache"](https://arxiv.org/abs/2402.02750) and currently supports [`~QuantoQuantizedCache`] and [`~HQQQuantizedCache`] classes. For more information on the inner workings see the paper.
+
+To enable quantization of the key-value cache, one needs to indicate `cache_implementation="quantized"` in the `generation_config`.
+Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`~QuantizedCacheConfig`] class.
+One has to indicate which quantization backend to use in the [`~QuantizedCacheConfig`], the default is `quanto`.
+
+
+
+Cache quantization can be detrimental in terms of latency if the context length is short and there is enough GPU VRAM available to run without cache quantization. It is recommended to seek a balance between memory efficiency and latency.
+
+
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+I like rock music because it's loud and energetic. It's a great way to express myself and rel
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
+```
+
+### Offloaded Cache
+
+Similarly to KV cache quantization, the [`~OffloadedCache`] strategy aims to reduce GPU VRAM usage.
+It does so by moving the KV cache for most layers to the CPU.
+As the model's `forward()` method iterates over the layers, this strategy maintains the current layer cache on the GPU.
+At the same time, it asynchronously prefetches the next layer's cache and sends the previous layer's cache back to the CPU.
+Unlike KV cache quantization, this strategy always produces the same result as the default KV cache implementation.
+Thus, it can serve as a drop-in replacement or a fallback for it.
+
+Depending on your model and the characteristics of your generation task (size of context, number of generated tokens, number of beams, etc.),
+you may notice a small degradation in generation throughput compared to the default KV cache implementation.
+
+To enable KV cache offloading, pass `cache_implementation="offloaded"` in the `generation_config` or directly to the `generate()` call.
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
+
+>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
+>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
+```
+
+
+
+Cache offloading requires a GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors.
+
+
+
+The example below shows how KV cache offloading can be used as a fallback strategy.
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+>>> def resilient_generate(model, *args, **kwargs):
+... oom = False
+... try:
+... return model.generate(*args, **kwargs)
+... except torch.cuda.OutOfMemoryError as e:
+... print(e)
+... print("retrying with cache_implementation='offloaded'")
+... oom = True
+... if oom:
+... torch.cuda.empty_cache()
+... kwargs["cache_implementation"] = "offloaded"
+... return model.generate(*args, **kwargs)
+...
+...
+>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
+>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
+>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
+>>> prompt = ["okay "*1000 + "Fun fact: The most"]
+>>> inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+>>> beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, }
+>>> out = resilient_generate(model, **inputs, **beams)
+>>> responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True)
+```
+
+On a GPU with 50 GB of RAM, running this code will print
+```
+CUDA out of memory. Tried to allocate 4.83 GiB. GPU
+retrying with cache_implementation='offloaded'
+```
+before successfully generating 40 beams.
+
+
+
+### Static Cache
+
+Since the [`~DynamicCache`] dynamically grows with each generation step, it prevents you from taking advantage of JIT optimizations. The [`~StaticCache`] pre-allocates
+a specific maximum size for the keys and values, allowing you to generate up to the maximum length without having to modify the cache size. Check the usage example below.
+
+For more examples with Static Cache and JIT compilation, take a look at [StaticCache & torch.compile](./llm_optims.md#static-kv-cache-and-torchcompile).
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
+
+>>> # simply pass cache_implementation="static"
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
+>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
+```
+
+### Sliding Window Cache
+
+As the name suggests, this cache type implements a sliding window over previous keys and values, retaining only the last `sliding_window` tokens. It should be used only with models that support sliding window attention, such as Mistral. Additionally, similar to the Static Cache, it is JIT-friendly and can be used with the same compile techniques.
+
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
+
+>>> # can be used by passing in cache implementation
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
+>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+"Yesterday I was on a rock concert and. I was so excited to see my favorite band. I was so excited that I was jumping up and down and screaming. I was so excited that I"
+```
+
+### Sink Cache
+
+Sink Cache was introduced in ["Efficient Streaming Language Models with Attention Sinks"](https://arxiv.org/abs/2309.17453). It allows you to generate long sequences of text ("infinite length" according to the paper) without any fine-tuning. That is achieved by smart handling of previous keys and values; specifically, it retains a few initial tokens from the sequence, called "sink tokens". This is based on the observation that these initial tokens attract a significant portion of attention scores during the generation process. Tokens that come after the "sink tokens" are discarded on a sliding window basis, keeping only the latest `window_size` tokens. By keeping these initial tokens as "attention sinks," the model maintains stable performance even when dealing with very long texts, despite discarding most of the previous context.
+
+Unlike other cache classes, this one can't be used directly by indicating a `cache_implementation`. You have to initialize the Cache before calling `generate()`, as follows.
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)
+
+>>> # get our cache, specify number of sink tokens and window size
+>>> # Note that window size already includes sink tokens, so it has to be larger
+>>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values)
+>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily"
+```
+
+### Encoder-Decoder Cache
+
+The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of the past key/values required for these complex models. A cool thing about the Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper.md) models, but we will be adding more models soon.
+
+In terms of usage, there is nothing special to be done: calling `generate()` or `forward()` will handle everything for you, as the sketch below shows.
+
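+As a minimal sketch of this (the `openai/whisper-small` checkpoint and the tiny `hf-internal-testing/librispeech_asr_dummy` dataset are used purely for illustration), generation looks the same as for any other model, and the [`~EncoderDecoderCache`] is created and managed internally:
+
+```python
+>>> import torch
+>>> from datasets import load_dataset
+>>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+
+>>> processor = AutoProcessor.from_pretrained("openai/whisper-small")
+>>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small", torch_dtype=torch.float16).to("cuda:0")
+
+>>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> sample = ds[0]["audio"]
+
+>>> # the processor turns raw audio into `input_features`; `generate()` manages the self- and cross-attention caches
+>>> inputs = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").to(model.device, torch.float16)
+>>> out = model.generate(**inputs)
+>>> transcription = processor.batch_decode(out, skip_special_tokens=True)[0]
+```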
+
+### Model-specific Cache Classes
+
+Some models require storing previous keys, values, or states in a specific way, and the above cache classes cannot be used. For such cases, we have several specialized cache classes that are designed for specific models. These models only accept their own dedicated cache classes and do not support using any other cache types. Some examples include [`~HybridCache`] for [Gemma2](./model_doc/gemma2.md) series models or [`~MambaCache`] for [Mamba](./model_doc/mamba.md) architecture models.
+
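+As a minimal sketch (the `state-spaces/mamba-130m-hf` checkpoint is used purely for illustration), nothing special is required: `generate()` builds and manages the dedicated cache internally, and you should not pass one of the `cache_implementation` options from the table above.
+
+```python
+>>> from transformers import AutoTokenizer, MambaForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf")
+>>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf")
+>>> inputs = tokenizer("Fun fact: the shortest", return_tensors="pt")
+
+>>> # the model-specific MambaCache is created internally during generation
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=10)
+>>> text = tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+```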
+
+## Iterative Generation with Cache
+
+We have seen how to use each of the cache types when generating. What if you want to use the cache in an iterative generation setting, for example in applications like chatbots, where interactions involve multiple turns and continuous back-and-forth exchanges? Iterative generation with cache allows these systems to handle ongoing conversations effectively without reprocessing the entire context at each step. But there are some tips that you should know before you start implementing:
+
+The general format when doing iterative generation is shown below. First you have to initialize an empty cache of the type you want, then you can start feeding in new prompts iteratively. Keeping track of the dialogue history and formatting can be done with chat templates; read more on that in [chat_templating](./chat_templating.md).
+
+If you are using Sink Cache, you have to crop your inputs to the maximum cache length because Sink Cache can generate text longer than its maximum window size, but it expects the first input to not exceed the maximum cache length.
+
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+>>> from transformers.cache_utils import (
+...     DynamicCache,
+...     SinkCache,
+...     StaticCache,
+...     SlidingWindowCache,
+...     QuantoQuantizedCache,
+...     QuantizedCacheConfig,
+... )
+
+>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
+>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+>>> user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]
+
+>>> past_key_values = DynamicCache()
+>>> max_cache_length = past_key_values.get_max_length()
+
+>>> messages = []
+>>> for prompt in user_prompts:
+... messages.append({"role": "user", "content": prompt})
+... inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
+... if isinstance(past_key_values, SinkCache):
+... inputs = {k: v[:, -max_cache_length:] for k, v in inputs.items()}
+...
+... input_length = inputs["input_ids"].shape[1]
+...
+... outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256, past_key_values=past_key_values)
+... completion = tokenizer.decode(outputs[0, input_length: ], skip_special_tokens=True)
+... messages.append({"role": "assistant", "content": completion})
+
+>>> print(messages)
+[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': " Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. 😊"}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': ' Oh, cool! That sounds like a lot of fun! 🎉 Did you enjoy the concert? What was the band like? 🤔'}]
+```
+
+
+## Re-use Cache to continue generation
+
+Sometimes you would want to first fill a cache object with key/values for a certain prefix prompt and re-use it several times to generate different sequences from it. We are working hard on adding this feature to 🤗 Transformers and will update this section soon.
diff --git a/docs/source/en/llm_optims.md b/docs/source/en/llm_optims.md
index 5e49f0e1ebd3ab..8e7e9c54d42a42 100644
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@@ -18,59 +18,109 @@ Basic inference is slow because LLMs have to be called repeatedly to generate th
This guide will show you how to use the optimization techniques available in Transformers to accelerate LLM inference.
> [!TIP]
-> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes more optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.
+> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.
-## Static kv-cache and torch.compile
+## Static kv-cache and `torch.compile`
During decoding, a LLM computes the key-value (kv) values for each input token and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input now. This is not very efficient because you're recomputing the same kv values each time.
-To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
+To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
-The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up.
+The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware.
> [!WARNING]
-> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
+> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and `torch.compile`. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
-For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model.
+There are three flavors of static kv-cache usage, depending on the complexity of your task:
+1. Basic usage: simply set a flag in `generation_config` (recommended);
+2. Advanced usage: handle a cache object for multi-turn generation or a custom generation loop;
+3. Advanced usage: compile the entire `generate` function into a single graph, if having a single graph is relevant for you.
+
+Select the correct tab below for further instructions on each of these flavors.
+
+> [!TIP]
+> Regardless of the strategy used with `torch.compile`, you can avoid shape-related recompilations if you left-pad your LLM inputs to a limited set of values. The [`pad_to_multiple_of` tokenizer flag](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) is your friend!
+
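+For instance, a minimal sketch of this idea (reusing the illustrative `google/gemma-2b` checkpoint from the examples below) left-pads every prompt up to the next multiple of a fixed bucket size, so `torch.compile` only ever sees a handful of input shapes:
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b", padding_side="left")
+inputs = tokenizer(
+    ["The theory of special relativity states "],
+    padding=True,
+    pad_to_multiple_of=64,  # prompts are padded up to a multiple of 64 tokens
+    return_tensors="pt",
+)
+```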
+
+
+
+For this example, let's use the [Gemma](https://hf.co/google/gemma-2b) model. All we need to do is:
+1. Access the model's `generation_config` attribute and set the `cache_implementation` to "static";
+2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
+
+And that's it!
```py
from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
-model = AutoModelForCausalLM.from_pretrained(
- "google/gemma-2b", device_map="auto"
-)
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
+
+model.generation_config.cache_implementation = "static"
+
+model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+input_text = "The theory of special relativity states "
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
```
-There are two ways you can configure the model to use a static kv-cache. For a 7B model on an A100, both methods get a 4x speed up in the forward pass. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. If you're using the [`~GenerationMixin.generate`] method, the speed up is ~3x. The forward pass (which still gets 4x speed up) is only a part of the whole [`~GenerationMixin.generate`] code.
+Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. Avoiding re-compilation is critical to get the most out of `torch.compile`, and you should be aware of the following:
+1. If the batch size changes or the maximum output length increases between calls, the cache will have to be reinitialized, triggering a new compilation;
+2. The first couple of calls of the compiled function are slower, as the function is being compiled.
-
-
+> [!WARNING]
+> For a more advanced usage of the static cache, such as multi-turn conversations, we recommend instantiating and manipulating the cache object outside [`~GenerationMixin.generate`]. See the advanced usage tab.
+
+
+
-Access the model's `generation_config` attribute and set the `cache_implementation` to "static".
+A [`StaticCache`] object can be passed to the model's [`~GenerationMixin.generate`] under the `past_key_values` argument. The object will retain the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, like you would do with a dynamic cache.
```py
-model.generation_config.cache_implementation = "static"
-```
+from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
-Call torch.compile on the model to compile the forward pass with the static kv-cache.
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
-```py
-compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
+model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+prompt_length = input_ids.input_ids.shape[1]
+model.generation_config.max_new_tokens = 16
+
+past_key_values = StaticCache(
+ config=model.config,
+ max_batch_size=1,
+ # If you plan to reuse the cache, make sure the cache length is large enough for all cases
+ max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
+ device=model.device,
+ dtype=model.dtype
+)
+outputs = model.generate(**input_ids, past_key_values=past_key_values)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2']
-outputs = compiled_model.generate(**input_ids)
-tokenizer.batch_decode(outputs, skip_special_tokens=True)
-['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
+# pass in the generated text and the same cache object to continue generation from where it left off. Optionally, in a
+# multi-turn conversation, append the new user input to the generated text.
+new_input_ids = outputs
+outputs = model.generate(new_input_ids, past_key_values=past_key_values)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2. The speed of light is constant in all inertial reference frames. 3.']
```
-Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. However, if the batch size or the maximum output length increase between calls, the cache will have to be reinitialized, triggering a new compilation.
-
-
-
+> [!TIP]
+> If you want to reuse the same [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method between calls.
-A [`StaticCache`] object can be passed to the model's forward pass under the `past_key_values` argument, enabling the use of this object as a static kv-cache. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. You can also pass the [`StaticCache`] object to [`~GenerationMixin.generate`] and use it across calls, like you would do with a dynamic cache.
+If you want to go further down a level, the [`StaticCache`] object can also be passed to the model's forward pass under the same `past_key_values` argument. Using this strategy, you can write your own function to decode the next token given the current token, its position, and the cache position of previously generated tokens.
```py
from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
@@ -102,12 +152,9 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu
return new_token
```
-There are a few important things you must do to enable static kv-cache and torch.compile with the `StaticCache` method:
-
+There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method:
1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length.
-
-2. Call torch.compile on the model to compile the forward pass with the static kv-cache.
-
+2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more.
```py
@@ -142,8 +189,34 @@ text
'My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p']
```
-> [!TIP]
-> If you want to reuse the [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method
+
+
+
+In terms of code, compiling the entire `generate` function is even simpler than the basic usage: call `torch.compile` on `generate` to compile the entire function. There is no need to specify the use of the static cache: although it is compatible, the dynamic cache (the default) was faster in our benchmarks.
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
+
+model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True)
+input_text = "The theory of special relativity states "
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
+```
+
+As a result, we compile not only the model forward pass, but also all input preparation, logit processor operations, and so on. The result should be a slightly faster `generate` call compared to the basic usage example, and the compiled graph may be better suited to more exotic hardware devices or use cases. However, there are severe drawbacks to using this approach:
+1. Compilation is much slower;
+2. All parameterization of `generate` must be done through `generation_config`;
+3. Many warnings and exceptions are suppressed -- we suggest testing with its uncompiled form first;
+4. Although we are working on it, it is heavily feature restricted (for instance, at the time of writing, generation does not stop if an EOS token is selected).
diff --git a/docs/source/en/main_classes/agent.md b/docs/source/en/main_classes/agent.md
index 8376fb36486c7c..444003615ba4f1 100644
--- a/docs/source/en/main_classes/agent.md
+++ b/docs/source/en/main_classes/agent.md
@@ -72,6 +72,10 @@ We provide two types of agents, based on the main [`Agent`] class:
[[autodoc]] launch_gradio_demo
+### stream_to_gradio
+
+[[autodoc]] stream_to_gradio
+
### ToolCollection
[[autodoc]] ToolCollection
diff --git a/docs/source/en/main_classes/backbones.md b/docs/source/en/main_classes/backbones.md
index efea7eb32a84c8..5f1fc1dcbe1f20 100644
--- a/docs/source/en/main_classes/backbones.md
+++ b/docs/source/en/main_classes/backbones.md
@@ -25,11 +25,11 @@ A backbone is a model used for feature extraction for higher level computer visi
Backbones are supported for the following models:
-* [BEiT](..model_doc/beit)
+* [BEiT](../model_doc/beit)
* [BiT](../model_doc/bit)
-* [ConvNet](../model_doc/convnext)
+* [ConvNext](../model_doc/convnext)
* [ConvNextV2](../model_doc/convnextv2)
-* [DiNAT](..model_doc/dinat)
+* [DiNAT](../model_doc/dinat)
* [DINOV2](../model_doc/dinov2)
* [FocalNet](../model_doc/focalnet)
* [MaskFormer](../model_doc/maskformer)
diff --git a/docs/source/en/main_classes/data_collator.md b/docs/source/en/main_classes/data_collator.md
index 74e653dd1185e9..e704bb747fe6e0 100644
--- a/docs/source/en/main_classes/data_collator.md
+++ b/docs/source/en/main_classes/data_collator.md
@@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
+
+## DataCollatorWithFlattening
+
+[[autodoc]] data.data_collator.DataCollatorWithFlattening
+
diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md
index f1e2acdcfe4809..fc5808415cbe5f 100755
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@@ -56,3 +56,8 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
## HqqConfig
[[autodoc]] HqqConfig
+
+## FbgemmFp8Config
+
+[[autodoc]] FbgemmFp8Config
+
diff --git a/docs/source/en/model_doc/chameleon.md b/docs/source/en/model_doc/chameleon.md
index e2a0012ba97f2c..323b83813160b0 100644
--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@@ -34,13 +34,13 @@ being competitive with models such as Mixtral 8x7B and Gemini-Pro, and performs
generation, all in a single model. It also matches or exceeds the performance of much larger models,
including Gemini Pro and GPT-4V, according to human judgments on a new long-form mixed-modal
generation evaluation, where either the prompt or outputs contain mixed sequences of both images and
-text. Chameleon marks a significant step forward in a unified modeling of full multimodal documents*
+text. Chameleon marks a significant step forward in unified modeling of full multimodal documents*
- Chameleon incorporates a vector quantizer module to transform images into discrete tokens. That also enables image geenration using an auto-regressive transformer. Taken from the original paper.
+ Chameleon incorporates a vector quantizer module to transform images into discrete tokens. That also enables image generation using an auto-regressive transformer. Taken from the original paper.
This model was contributed by [joaogante](https://huggingface.co/joaogante) and [RaushanTurganbay](https://huggingface.co/RaushanTurganbay).
The original code can be found [here](https://github.com/facebookresearch/chameleon).
@@ -55,27 +55,28 @@ The original code can be found [here](https://github.com/facebookresearch/chamel
- Chameleon generates in chat format which means that the generated text will always be the "assistant's turn". You can enable a text completion generation by passing `return_for_text_completion=True` when calling the processor.
> [!NOTE]
-> Chameleon implementation in Transformers uses a special image token to indicate where to merge image embeddings. For special image token we didn't add a new one but used one of the reserved tokens: ``.
+> The Chameleon implementation in Transformers uses a special image token to indicate where to merge image embeddings. For the special image token, we didn't add a new one but used one of the reserved tokens: ``. You have to add `` to your prompt in the place where the image should be embedded for correct generation.
## Usage example
### Single image inference
-Here's how to load the model and perform inference in half-precision (`torch.float16`):
+Chameleon is a gated model, so make sure to request access and log in to the Hugging Face Hub using a token.
+Here's how to load the model and perform inference in half-precision (`torch.bfloat16`):
```python
-from transformers import ChameleonProcessor, ChameleonForCausalLM
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
import torch
from PIL import Image
import requests
-processor = ChameleonProcessor.from_pretrained("meta-chameleon")
-model = ChameleonForCausalLM.from_pretrained("meta-chameleon", torch_dtype=torch.float16, device_map="auto")
+processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
+model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda")
# prepare image and text prompt
-url = "https://bjiujitsu.com/wp-content/uploads/2021/01/jiu_jitsu_belt_white_1.jpg"
+url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
-prompt = "What color is the belt in this image?"
+prompt = "What do you see in this image?"
inputs = processor(prompt, image, return_tensors="pt").to(model.device)
@@ -89,13 +90,14 @@ print(processor.decode(output[0], skip_special_tokens=True))
Chameleon can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). Here is how you can do it:
```python
-from transformers import ChameleonProcessor, ChameleonForCausalLM
+from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
import torch
from PIL import Image
import requests
-processor = ChameleonProcessor.from_pretrained("meta-chameleon")
-model = ChameleonForCausalLM.from_pretrained("meta-chameleon", torch_dtype=torch.float16, device_map="auto")
+processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
+
+model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", torch_dtype=torch.bfloat16, device_map="cuda")
# Get three different images
url = "https://www.ilankelman.org/stopsigns/australia.jpg"
@@ -115,7 +117,7 @@ prompts = [
# We can simply feed images in the order they have to be used in the text prompt
# Each "" token uses one image leaving the next for the subsequent "" tokens
-inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
+inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
# Generate
generate_ids = model.generate(**inputs, max_new_tokens=50)
@@ -129,16 +131,16 @@ processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokeniza
The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes` and make sure to have access to a CUDA compatible GPU device. Simply change the snippet above with:
```python
-from transformers import ChameleonForCausalLM, BitsAndBytesConfig
+from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig
# specify how to quantize the model
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_quant_type="nf4",
- bnb_4bit_compute_dtype=torch.float16,
+ bnb_4bit_compute_dtype=torch.bfloat16,
)
-model = ChameleonForCausalLM.from_pretrained("meta-chameleon", quantization_config=quantization_config, device_map="auto")
+model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", quantization_config=quantization_config, device_map="cuda")
```
### Use Flash-Attention 2 and SDPA to further speed-up generation
@@ -146,11 +148,12 @@ model = ChameleonForCausalLM.from_pretrained("meta-chameleon", quantization_conf
The model supports both Flash Attention 2 and PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html), which can be enabled for optimization. SDPA is the default option when you load the model. If you want to switch to Flash Attention 2, first make sure to install flash-attn; refer to the [original repository](https://github.com/Dao-AILab/flash-attention) for installation instructions. Then simply change the snippet above as follows:
```python
-from transformers import ChameleonForCausalLM
+from transformers import ChameleonForConditionalGeneration
-model = ChameleonForCausalLM.from_pretrained(
+model_id = "facebook/chameleon-7b"
+model = ChameleonForConditionalGeneration.from_pretrained(
model_id,
- torch_dtype=torch.float16,
+ torch_dtype=torch.bfloat16,
low_cpu_mem_usage=True,
attn_implementation="flash_attention_2"
).to(0)
@@ -183,7 +186,7 @@ model = ChameleonForCausalLM.from_pretrained(
[[autodoc]] ChameleonModel
- forward
-## ChameleonForCausalLM
+## ChameleonForConditionalGeneration
-[[autodoc]] ChameleonForCausalLM
+[[autodoc]] ChameleonForConditionalGeneration
- forward
diff --git a/docs/source/en/model_doc/clip.md b/docs/source/en/model_doc/clip.md
index 692ea083717c42..f0829f484aaa51 100644
--- a/docs/source/en/model_doc/clip.md
+++ b/docs/source/en/model_doc/clip.md
@@ -79,6 +79,123 @@ encode the text and prepare the images. The following example shows how to get t
>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
```
+
+### Combining CLIP and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Also make sure that your hardware is compatible with Flash Attention 2. Read more about it in the official documentation of the flash-attn repository. Also make sure to load your model in half-precision (e.g. `torch.float16`).
+
+
+
+For small batch sizes, you might notice a slowdown in your model when using flash attention. Refer to the section [Expected speedups with Flash Attention and SDPA](#Expected-speedups-with-Flash-Attention-and-SDPA) below and select an appropriate attention implementation.
+
+
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> import requests
+>>> from PIL import Image
+
+>>> from transformers import CLIPProcessor, CLIPModel
+
+>>> device = "cuda"
+>>> torch_dtype = torch.float16
+
+>>> model = CLIPModel.from_pretrained(
+... "openai/clip-vit-base-patch32",
+... attn_implementation="flash_attention_2",
+... device_map=device,
+... torch_dtype=torch_dtype,
+... )
+>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+>>> inputs.to(device)
+
+>>> with torch.no_grad():
+... with torch.autocast(device):
+... outputs = model(**inputs)
+
+>>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities
+>>> print(probs)
+tensor([[0.9946, 0.0052]], device='cuda:0', dtype=torch.float16)
+```
+
+
+### Using Scaled Dot Product Attention (SDPA)
+
+PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
+encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
+[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
+or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
+page for more information.
+
+SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
+`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+
+```python
+import torch
+from transformers import CLIPModel
+
+model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32", torch_dtype=torch.float16, attn_implementation="sdpa")
+```
+
+For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+
+### Expected speedups with Flash Attention and SDPA
+
+On a local benchmark (NVIDIA A10G, PyTorch 2.3.1+cu121) with `float16`, we saw the following speedups during inference for `"openai/clip-vit-large-patch14"` checkpoint ([code](https://gist.github.com/qubvel/ac691a54e54f9fae8144275f866a7ff8)):
+
+#### CLIPTextModel
+
+| Num text labels | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup |
+|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
+| 4 | 0.009 | 0.012 | 0.737 | 0.007 | 1.269 |
+| 16 | 0.009 | 0.014 | 0.659 | 0.008 | 1.187 |
+| 32 | 0.018 | 0.021 | 0.862 | 0.016 | 1.142 |
+| 64 | 0.034 | 0.034 | 1.001 | 0.03 | 1.163 |
+| 128 | 0.063 | 0.058 | 1.09 | 0.054 | 1.174 |
+
+![clip_text_model_viz_3](https://github.com/user-attachments/assets/e9826b43-4e66-4f4c-952b-af4d90bd38eb)
+
+#### CLIPVisionModel
+
+| Image batch size | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup |
+|-------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
+| 1 | 0.016 | 0.013 | 1.247 | 0.012 | 1.318 |
+| 4 | 0.025 | 0.021 | 1.198 | 0.021 | 1.202 |
+| 16 | 0.093 | 0.075 | 1.234 | 0.075 | 1.24 |
+| 32 | 0.181 | 0.147 | 1.237 | 0.146 | 1.241 |
+
+![clip_image_model_viz_3](https://github.com/user-attachments/assets/50a36206-e3b9-4adc-ac8e-926b8b071d63)
+
+#### CLIPModel
+
+| Image batch size | Num text labels | Eager (s/iter) | FA2 (s/iter) | FA2 speedup | SDPA (s/iter) | SDPA speedup |
+|-------------------:|------------------:|-----------------:|---------------:|--------------:|----------------:|---------------:|
+| 1 | 4 | 0.025 | 0.026 | 0.954 | 0.02 | 1.217 |
+| 1 | 16 | 0.026 | 0.028 | 0.918 | 0.02 | 1.287 |
+| 1 | 64 | 0.042 | 0.046 | 0.906 | 0.036 | 1.167 |
+| 4 | 4 | 0.028 | 0.033 | 0.849 | 0.024 | 1.189 |
+| 4 | 16 | 0.034 | 0.035 | 0.955 | 0.029 | 1.169 |
+| 4 | 64 | 0.059 | 0.055 | 1.072 | 0.05 | 1.179 |
+| 16 | 4 | 0.096 | 0.088 | 1.091 | 0.078 | 1.234 |
+| 16 | 16 | 0.102 | 0.09 | 1.129 | 0.083 | 1.224 |
+| 16 | 64 | 0.127 | 0.11 | 1.157 | 0.105 | 1.218 |
+| 32 | 4 | 0.185 | 0.159 | 1.157 | 0.149 | 1.238 |
+| 32 | 16 | 0.19 | 0.162 | 1.177 | 0.154 | 1.233 |
+| 32 | 64 | 0.216 | 0.181 | 1.19 | 0.176 | 1.228 |
+
## Resources
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIP.
diff --git a/docs/source/en/model_doc/dinov2.md b/docs/source/en/model_doc/dinov2.md
index dca94786773d1d..e8f7c08cbfc44b 100644
--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@@ -57,7 +57,7 @@ print((last_hidden_states - traced_outputs[0]).abs().max())
## Resources
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DPT.
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DINOv2.
- Demo notebooks for DINOv2 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DINOv2). 🌎
diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md
index d258f492abf8b5..a6da554f8d5053 100644
--- a/docs/source/en/model_doc/grounding-dino.md
+++ b/docs/source/en/model_doc/grounding-dino.md
@@ -41,33 +41,40 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding
Here's how to use the model for zero-shot object detection:
```python
-import requests
-
-import torch
-from PIL import Image
-from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection,
-
-model_id = "IDEA-Research/grounding-dino-tiny"
-
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
-
-image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(image_url, stream=True).raw)
-# Check for cats and remote controls
-text = "a cat. a remote control."
-
-inputs = processor(images=image, text=text, return_tensors="pt").to(device)
-with torch.no_grad():
- outputs = model(**inputs)
-
-results = processor.post_process_grounded_object_detection(
- outputs,
- inputs.input_ids,
- box_threshold=0.4,
- text_threshold=0.3,
- target_sizes=[image.size[::-1]]
-)
+>>> import requests
+
+>>> import torch
+>>> from PIL import Image
+>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+>>> model_id = "IDEA-Research/grounding-dino-tiny"
+>>> device = "cuda"
+
+>>> processor = AutoProcessor.from_pretrained(model_id)
+>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+
+>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(image_url, stream=True).raw)
+>>> # Check for cats and remote controls
+>>> text = "a cat. a remote control."
+
+>>> inputs = processor(images=image, text=text, return_tensors="pt").to(device)
+>>> with torch.no_grad():
+... outputs = model(**inputs)
+
+>>> results = processor.post_process_grounded_object_detection(
+... outputs,
+... inputs.input_ids,
+... box_threshold=0.4,
+... text_threshold=0.3,
+... target_sizes=[image.size[::-1]]
+... )
+>>> print(results)
+[{'boxes': tensor([[344.6959, 23.1090, 637.1833, 374.2751],
+ [ 12.2666, 51.9145, 316.8582, 472.4392],
+ [ 38.5742, 70.0015, 176.7838, 118.1806]], device='cuda:0'),
+ 'labels': ['a cat', 'a cat', 'a remote control'],
+ 'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}]
```
## Grounded SAM
diff --git a/docs/source/en/model_doc/hiera.md b/docs/source/en/model_doc/hiera.md
index 24bf1639fe1400..9bd2816e7a99fc 100644
--- a/docs/source/en/model_doc/hiera.md
+++ b/docs/source/en/model_doc/hiera.md
@@ -26,8 +26,22 @@ The abstract from the paper is the following:
*Modern hierarchical vision transformers have added several vision-specific components in the pursuit of supervised classification performance. While these components lead to effective accuracies and attractive FLOP counts, the added complexity actually makes these transformers slower than their vanilla ViT counterparts. In this paper, we argue that this additional bulk is unnecessary. By pretraining with a strong visual pretext task (MAE), we can strip out all the bells-and-whistles from a state-of-the-art multi-stage vision transformer without losing accuracy. In the process, we create Hiera, an extremely simple hierarchical vision transformer that is more accurate than previous models while being significantly faster both at inference and during training. We evaluate Hiera on a variety of tasks for image and video recognition. Our code and models are available at https://github.com/facebookresearch/hiera.*
+
+
+ Hiera architecture. Taken from the original paper.
+
This model was a joint contribution by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [namangarg110](https://huggingface.co/namangarg110). The original code can be found [here](https://github.com/facebookresearch/hiera).
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Hiera. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+
+- [`HieraForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb). A minimal inference sketch is also shown below.
+- See also: [Image classification task guide](../tasks/image_classification)
+
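+For instance, a minimal image-classification sketch with [`HieraForImageClassification`] (the checkpoint name below is an assumption; check the Hub for the exact identifier):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoImageProcessor, HieraForImageClassification
+
+checkpoint = "facebook/hiera-tiny-224-in1k-hf"  # hypothetical checkpoint name
+processor = AutoImageProcessor.from_pretrained(checkpoint)
+model = HieraForImageClassification.from_pretrained(checkpoint)
+
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+inputs = processor(images=image, return_tensors="pt")
+with torch.no_grad():
+    logits = model(**inputs).logits
+
+# Map the highest-scoring logit back to a class name
+print(model.config.id2label[logits.argmax(-1).item()])
+```
+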
## HieraConfig
[[autodoc]] HieraConfig
diff --git a/docs/source/en/model_doc/llava-next-video.md b/docs/source/en/model_doc/llava-next-video.md
index 88e41efc29c87c..48e50f950621e8 100644
--- a/docs/source/en/model_doc/llava-next-video.md
+++ b/docs/source/en/model_doc/llava-next-video.md
@@ -43,6 +43,13 @@ The original code can be found [here](https://github.com/LLaVA-VL/LLaVA-NeXT/tre
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
+
+
+- Llava-Next uses a different number of patches per image and thus has to pad the inputs inside the modeling code, in addition to the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding".
+
+
+
+
- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use tokenizer's `apply_chat_template` to format your prompts correctly. Below is an example of how to do that.
We will use [LLaVA-NeXT-Video-7B-hf](https://huggingface.co/llava-hf/LLaVA-NeXT-Video-7B-hf) and a conversation history of videos and images. Each content field has to be a list of dicts, as follows:
diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md
index 43eaa41d5d7140..a7e4b4da7f3c5a 100644
--- a/docs/source/en/model_doc/llava.md
+++ b/docs/source/en/model_doc/llava.md
@@ -40,7 +40,42 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
-- For better results, we recommend users to prompt the model with the correct prompt format. Below is a list of prompt formats accepted by each llava checkpoint:
+- For better results, we recommend using the processor's `apply_chat_template()` method to format your prompt correctly. For that you need to construct a conversation history; passing in a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities, as follows:
+
+```python
+from transformers import AutoProcessor
+
+processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+
+conversation = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What’s shown in this image?"},
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": [{"type": "text", "text": "This image shows a red stop sign."},]
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Describe the image in more details."},
+ ],
+ },
+]
+
+text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
+print(text_prompt)
+>>> "USER: \nUSER: Describe the image in more details. ASSISTANT:"
+```
+
+- If you want to construct a chat prompt yourself, below is a list of prompt formats accepted by each llava checkpoint:
[llava-interleave models](https://huggingface.co/collections/llava-hf/llava-interleave-668e19a97da0036aad4a2f19) requires the following format:
```bash
@@ -64,6 +99,7 @@ For multiple turns conversation:
"USER: \n ASSISTANT: USER: ASSISTANT: USER: ASSISTANT:"
```
+
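+Putting the pieces above together, here is a hedged end-to-end sketch (using the `llava-hf/llava-1.5-7b-hf` checkpoint mentioned above; adjust device and dtype to your setup):
+
+```python
+import requests
+import torch
+from PIL import Image
+from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+model_id = "llava-hf/llava-1.5-7b-hf"
+processor = AutoProcessor.from_pretrained(model_id)
+model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda")
+
+# Format the prompt with the chat template, as described above
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {"type": "image"},
+            {"type": "text", "text": "What's shown in this image?"},
+        ],
+    },
+]
+text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+# The template only formats the text; the processor still tokenizes it and extracts pixel values
+url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+inputs = processor(images=image, text=text_prompt, return_tensors="pt").to("cuda", torch.float16)
+
+output = model.generate(**inputs, max_new_tokens=50)
+print(processor.decode(output[0], skip_special_tokens=True))
+```
+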
### Using Flash Attention 2
Flash Attention 2 is an even faster, optimized version of the previous optimization, please refer to the [Flash Attention 2 section of performance docs](https://huggingface.co/docs/transformers/perf_infer_gpu_one).
diff --git a/docs/source/en/model_doc/llava_next.md b/docs/source/en/model_doc/llava_next.md
index a4a1419ee00ac8..d0558be76467a2 100644
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@@ -46,26 +46,79 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
-- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. Below, we list the correct prompt formats to use for the text prompt "What is shown in this image?":
+
-[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
+- Llava-Next uses a different number of patches per image and thus has to pad the inputs inside the modeling code, in addition to the padding done when processing the inputs. The default setting is "left-padding" if the model is in `eval()` mode, otherwise "right-padding".
+
+
+
+
+- Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history; passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries for the "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint.
+We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:
+
+```python
+from transformers import LlavaNextProcessor
+
+processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+
+conversation = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What’s shown in this image?"},
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": [{"type": "text", "text": "This image shows a red stop sign."},]
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Describe the image in more details."},
+ ],
+ },
+]
+
+text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
+print(text_prompt)
+>>> "[INST] \nWhat's shown in this image? [/INST] This image shows a red stop sign. [INST] Describe the image in more details. [/INST]"
+```
+
+- If you want to construct a chat prompt yourself, below is a list of possible formats.
+
+[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
```bash
"[INST] \nWhat is shown in this image? [/INST]"
```
[llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) and [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) require the following format:
-
```bash
"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? ASSISTANT:"
```
[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format:
-
```bash
"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
```
+[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf) requires the following format:
+
+```bash
+"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
+```
+
+[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format:
+
+```bash
+"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
+```
+
## Usage example
### Single image inference
@@ -86,8 +139,17 @@ model.to("cuda:0")
# prepare image and text prompt, using the appropriate prompt template
url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
image = Image.open(requests.get(url, stream=True).raw)
-prompt = "[INST] \nWhat is shown in this image? [/INST]"
+conversation = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What is shown in this image?"},
+ ],
+ },
+]
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
inputs = processor(prompt, image, return_tensors="pt").to("cuda:0")
# autoregressively complete prompt
@@ -120,15 +182,47 @@ image_cats = Image.open(requests.get(url, stream=True).raw)
url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
image_snowman = Image.open(requests.get(url, stream=True).raw)
-# Prepare a batched prompt, where the first one is a multi-turn conversation and the second is not
-prompt = [
- "[INST] \nWhat is shown in this image? [/INST] There is a red stop sign in the image. [INST] \nWhat about this image? How many cats do you see [/INST]",
- "[INST] \nWhat is shown in this image? [/INST]"
+# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
+conversation_1 = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What is shown in this image?"},
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "There is a red stop sign in the image."},
+ ],
+ },
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What about this image? How many cats do you see?"},
+ ],
+ },
]
+conversation_2 = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "image"},
+ {"type": "text", "text": "What is shown in this image?"},
+ ],
+ },
+]
+
+prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
+prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
+prompts = [prompt_1, prompt_2]
+
# We can simply feed images in the order they have to be used in the text prompt
# Each "" token uses one image leaving the next for the subsequent "" tokens
-inputs = processor(text=prompt, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
+inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(model.device)
# Generate
generate_ids = model.generate(**inputs, max_new_tokens=30)
diff --git a/docs/source/en/model_doc/marian.md b/docs/source/en/model_doc/marian.md
index 8078ea1427c952..d8ebec8ffb0ad2 100644
--- a/docs/source/en/model_doc/marian.md
+++ b/docs/source/en/model_doc/marian.md
@@ -105,7 +105,7 @@ from huggingface_hub import list_models
model_list = list_models()
org = "Helsinki-NLP"
-model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+model_ids = [x.id for x in model_list if x.id.startswith(org)]
suffix = [x.split("/")[1] for x in model_ids]
old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
```
diff --git a/docs/source/en/model_doc/qwen2.md b/docs/source/en/model_doc/qwen2.md
index ac0e25e02c35f9..16815f2fc1f3cd 100644
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
## Overview
-Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc.
+Qwen2 is the new model series of large language models from the Qwen team, following the original Qwen series. It includes Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc.
### Model Details
@@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different
## Usage tips
-`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
+`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
-In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
+In the following, we demonstrate how to use `Qwen2-7B-Instruct` for inference. Note that we use the ChatML format for dialog; in this demo we show how to leverage `apply_chat_template` for this purpose.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto
->>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto")
->>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
+>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto")
+>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
>>> prompt = "Give me a short introduction to large language model."
diff --git a/docs/source/en/model_doc/roberta.md b/docs/source/en/model_doc/roberta.md
index 364b5b37e5f3f0..2a1843d8885abe 100644
--- a/docs/source/en/model_doc/roberta.md
+++ b/docs/source/en/model_doc/roberta.md
@@ -51,19 +51,19 @@ This model was contributed by [julien-c](https://huggingface.co/julien-c). The o
## Usage tips
-- This implementation is the same as [`BertModel`] with a tiny embeddings tweak as well as a setup
- for Roberta pretrained models.
-- RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
+- This implementation is the same as [`BertModel`] with a minor tweak to the embeddings, as well as a setup
+ for RoBERTa pretrained models.
+- RoBERTa has the same architecture as BERT but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
different pretraining scheme.
-- RoBERTa doesn't have `token_type_ids`, you don't need to indicate which token belongs to which segment. Just
- separate your segments with the separation token `tokenizer.sep_token` (or ``)
-- Same as BERT with better pretraining tricks:
-
- * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
- * together to reach 512 tokens (so the sentences are in an order than may span several documents)
- * train with larger batches
- * use BPE with bytes as a subunit and not characters (because of unicode characters)
-- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to this page for usage examples.
+- RoBERTa doesn't have `token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
+ separate your segments with the separation token `tokenizer.sep_token` (or ``). See the short example after this list.
+- RoBERTa is similar to BERT but with better pretraining techniques:
+
+ * Dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all.
+ * Sentence packing: Sentences are packed together to reach 512 tokens (so the sentences are in an order that may span several documents).
+ * Larger batches: Training uses larger batches.
+ * Byte-level BPE vocabulary: Uses BPE with bytes as a subunit instead of characters, accommodating Unicode characters.
+- [CamemBERT](camembert) is a wrapper around RoBERTa. Refer to its model page for usage examples.
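+
+To illustrate the segment-separation tip above, a minimal sketch:
+
+```python
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base")
+
+# Pass the two segments as a pair: the tokenizer inserts the separator tokens
+# for you, and no `token_type_ids` are returned (RoBERTa has none by default)
+encoded = tokenizer("This is the first segment.", "And this is the second one.")
+
+print("token_type_ids" in encoded)
+print(tokenizer.decode(encoded["input_ids"]))
+```
+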
## Resources
diff --git a/docs/source/en/model_doc/video_llava.md b/docs/source/en/model_doc/video_llava.md
index 307c55bb2cef63..f098e82a177670 100644
--- a/docs/source/en/model_doc/video_llava.md
+++ b/docs/source/en/model_doc/video_llava.md
@@ -98,7 +98,7 @@ indices = np.arange(0, total_frames, total_frames / 8).astype(int)
video = read_video_pyav(container, indices)
# For better results, we recommend to prompt the model in the following format
-prompt = "USER: