From 801894e08c53a80c2b4377ce102a44dccfda8928 Mon Sep 17 00:00:00 2001 From: amitportnoy <113588658+amitportnoy@users.noreply.github.com> Date: Thu, 2 May 2024 16:30:21 +0300 Subject: [PATCH 1/5] phi3 chat_template does not support system role (#30606) * phi3 chat_template does not support system role * fix doc test error --- docs/source/en/model_doc/phi3.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/source/en/model_doc/phi3.md b/docs/source/en/model_doc/phi3.md index 4f6d7acad178e5..76d94008137eb2 100644 --- a/docs/source/en/model_doc/phi3.md +++ b/docs/source/en/model_doc/phi3.md @@ -9,7 +9,7 @@ Unless required by applicable law or agreed to in writing, software distributed an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. -⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be +⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be rendered properly in your Markdown viewer. --> @@ -51,13 +51,16 @@ Phi-3 has been integrated in the development version (4.40.0.dev) of `transforme >>> model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3-mini-4k-instruct") >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") ->>> messages = [{"role": "system", "content": "You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user."},{"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}] +>>> messages = [{"role": "user", "content": "Can you provide ways to eat combinations of bananas and dragonfruits?"}] >>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt") >>> outputs = model.generate(inputs, max_new_tokens=32) >>> text = tokenizer.batch_decode(outputs)[0] >>> print(text) -<|system|> You are a helpful digital assistant. Please provide safe, ethical and accurate information to the user.<|end|><|user|> Can you provide ways to eat combinations of bananas and dragonfruits?<|end|><|assistant|> Absolutely! Bananas and dragonfruits are both delicious fruits that can be combined in various ways to create tasty and nutrit +<|user|> +Can you provide ways to eat combinations of bananas and dragonfruits?<|end|> +<|assistant|> +Certainly! Bananas and dragonfruits can be combined in various delicious ways. Here are some ideas for eating combinations of bananas and ``` ## Phi3Config From aa55ff44a259913f0a5254a542b1d7af4b85e037 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 2 May 2024 14:42:25 +0100 Subject: [PATCH 2/5] Docs: fix `generate`-related rendering issues (#30600) * does this work? * like this? 
* fix the other generate links * missing these --- docs/source/en/generation_strategies.md | 2 +- docs/source/en/tasks/language_modeling.md | 2 +- docs/source/en/tasks/summarization.md | 2 +- docs/source/en/tasks/translation.md | 2 +- docs/source/ja/generation_strategies.md | 4 +-- docs/source/ja/tasks/language_modeling.md | 2 +- docs/source/ja/tasks/summarization.md | 2 +- docs/source/ja/tasks/translation.md | 2 +- docs/source/ko/generation_strategies.md | 2 +- docs/source/ko/tasks/language_modeling.md | 2 +- docs/source/ko/tasks/summarization.md | 32 +++++++++---------- docs/source/ko/tasks/translation.md | 4 +-- .../generation/configuration_utils.py | 10 ++---- 13 files changed, 31 insertions(+), 37 deletions(-) diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index c1d88c90b6f194..3d4829c3e37f18 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -21,7 +21,7 @@ more. It also plays a role in a variety of mixed-modality applications that have and vision-to-text. Some of the models that can generate text include GPT2, XLNet, OpenAI GPT, CTRL, TransformerXL, XLM, Bart, T5, GIT, Whisper. -Check out a few examples that use [`~transformers.generation_utils.GenerationMixin.generate`] method to produce +Check out a few examples that use [`~generation.GenerationMixin.generate`] method to produce text outputs for different tasks: * [Text summarization](./tasks/summarization#inference) * [Image captioning](./model_doc/git#transformers.GitForCausalLM.forward.example) diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md index af26ab1e44a0f6..fab9828ab20770 100644 --- a/docs/source/en/tasks/language_modeling.md +++ b/docs/source/en/tasks/language_modeling.md @@ -382,7 +382,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors: >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids ``` -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to generate text. +Use the [`~generation.GenerationMixin.generate`] method to generate text. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text generation strategies](../generation_strategies) page. ```py diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md index e9e77189d4613a..92542a774a882d 100644 --- a/docs/source/en/tasks/summarization.md +++ b/docs/source/en/tasks/summarization.md @@ -355,7 +355,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors: >>> inputs = tokenizer(text, return_tensors="pt").input_ids ``` -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. +Use the [`~generation.GenerationMixin.generate`] method to create the summarization. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. 
```py >>> from transformers import AutoModelForSeq2SeqLM diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md index e7838ea6be9625..e933fda461b1ae 100644 --- a/docs/source/en/tasks/translation.md +++ b/docs/source/en/tasks/translation.md @@ -364,7 +364,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors: >>> inputs = tokenizer(text, return_tensors="pt").input_ids ``` -Use the [`~transformers.generation_utils.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. +Use the [`~generation.GenerationMixin.generate`] method to create the translation. For more details about the different text generation strategies and parameters for controlling generation, check out the [Text Generation](../main_classes/text_generation) API. ```py >>> from transformers import AutoModelForSeq2SeqLM diff --git a/docs/source/ja/generation_strategies.md b/docs/source/ja/generation_strategies.md index 01a8cf203f7fb1..ff71d0c9b15d10 100644 --- a/docs/source/ja/generation_strategies.md +++ b/docs/source/ja/generation_strategies.md @@ -18,7 +18,7 @@ rendered properly in your Markdown viewer. テキスト生成は、オープンエンドのテキスト生成、要約、翻訳など、多くの自然言語処理タスクに不可欠です。また、テキストを出力とするさまざまな混在モダリティアプリケーションにも影響を与えており、例えば音声からテキストへの変換や画像からテキストへの変換などがあります。テキストを生成できるいくつかのモデルには、GPT2、XLNet、OpenAI GPT、CTRL、TransformerXL、XLM、Bart、T5、GIT、Whisperが含まれます。 -[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して、異なるタスクのテキスト出力を生成するいくつかの例をご紹介します: +[`~generation.GenerationMixin.generate`] メソッドを使用して、異なるタスクのテキスト出力を生成するいくつかの例をご紹介します: * [テキスト要約](./tasks/summarization#inference) * [画像のキャプション](./model_doc/git#transformers.GitForCausalLM.forward.example) * [音声の転記](./model_doc/whisper#transformers.WhisperForConditionalGeneration.forward.example) @@ -342,4 +342,4 @@ culture, and they allow us to design the' >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) ['Alice and Bob are going to the same party. 
It is a small party, in a small'] -``` \ No newline at end of file +``` diff --git a/docs/source/ja/tasks/language_modeling.md b/docs/source/ja/tasks/language_modeling.md index b65d60102ef1ca..1cadc0af0ac0a5 100644 --- a/docs/source/ja/tasks/language_modeling.md +++ b/docs/source/ja/tasks/language_modeling.md @@ -388,7 +388,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数 >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids ``` -[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用してテキストを生成します。 +[`~generation.GenerationMixin.generate`] メソッドを使用してテキストを生成します。 さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[テキスト生成戦略](../generation_strategies) ページを参照してください。 ```py diff --git a/docs/source/ja/tasks/summarization.md b/docs/source/ja/tasks/summarization.md index a4385f73792fc9..74152d5dbdaa5b 100644 --- a/docs/source/ja/tasks/summarization.md +++ b/docs/source/ja/tasks/summarization.md @@ -358,7 +358,7 @@ Tokenize the text and return the `input_ids` as PyTorch tensors: >>> inputs = tokenizer(text, return_tensors="pt").input_ids ``` -[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。 +[`~generation.GenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。 ```py >>> from transformers import AutoModelForSeq2SeqLM diff --git a/docs/source/ja/tasks/translation.md b/docs/source/ja/tasks/translation.md index f683581cd1116c..eb67604f9e1e6e 100644 --- a/docs/source/ja/tasks/translation.md +++ b/docs/source/ja/tasks/translation.md @@ -366,7 +366,7 @@ TensorFlow でモデルを微調整するには、オプティマイザー関数 >>> inputs = tokenizer(text, return_tensors="pt").input_ids ``` -[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して翻訳を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。 +[`~generation.GenerationMixin.generate`] メソッドを使用して翻訳を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。 ```py diff --git a/docs/source/ko/generation_strategies.md b/docs/source/ko/generation_strategies.md index fd7b9bf905aa0a..fd2ae359a54545 100644 --- a/docs/source/ko/generation_strategies.md +++ b/docs/source/ko/generation_strategies.md @@ -19,7 +19,7 @@ rendered properly in your Markdown viewer. 텍스트 생성은 개방형 텍스트 작성, 요약, 번역 등 다양한 자연어 처리(NLP) 작업에 필수적입니다. 이는 또한 음성-텍스트 변환, 시각-텍스트 변환과 같이 텍스트를 출력으로 하는 여러 혼합 모달리티 응용 프로그램에서도 중요한 역할을 합니다. 텍스트 생성을 가능하게 하는 몇몇 모델로는 GPT2, XLNet, OpenAI GPT, CTRL, TransformerXL, XLM, Bart, T5, GIT, Whisper 등이 있습니다. -[`~transformers.generation_utils.GenerationMixin.generate`] 메서드를 활용하여 다음과 같은 다양한 작업들에 대해 텍스트 결과물을 생성하는 몇 가지 예시를 살펴보세요: +[`~generation.GenerationMixin.generate`] 메서드를 활용하여 다음과 같은 다양한 작업들에 대해 텍스트 결과물을 생성하는 몇 가지 예시를 살펴보세요: * [텍스트 요약](./tasks/summarization#inference) * [이미지 캡셔닝](./model_doc/git#transformers.GitForCausalLM.forward.example) * [오디오 전사](./model_doc/whisper#transformers.WhisperForConditionalGeneration.forward.example) diff --git a/docs/source/ko/tasks/language_modeling.md b/docs/source/ko/tasks/language_modeling.md index ff2a47c24ece2a..1b531e49d6176d 100644 --- a/docs/source/ko/tasks/language_modeling.md +++ b/docs/source/ko/tasks/language_modeling.md @@ -366,7 +366,7 @@ TensorFlow에서 모델을 미세 조정하려면, 먼저 옵티마이저 함수 >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids ``` -[`~transformers.generation_utils.GenerationMixin.generate`] 메소드를 사용하여 텍스트를 생성하세요. 
생성을 제어하는 다양한 텍스트 생성 전략과 매개변수에 대한 자세한 내용은 [텍스트 생성 전략](../generation_strategies) 페이지를 확인하세요. +[`~generation.GenerationMixin.generate`] 메소드를 사용하여 텍스트를 생성하세요. 생성을 제어하는 다양한 텍스트 생성 전략과 매개변수에 대한 자세한 내용은 [텍스트 생성 전략](../generation_strategies) 페이지를 확인하세요. ```py >>> from transformers import AutoModelForCausalLM diff --git a/docs/source/ko/tasks/summarization.md b/docs/source/ko/tasks/summarization.md index fc09d6a86e1fbf..501aaae7268121 100644 --- a/docs/source/ko/tasks/summarization.md +++ b/docs/source/ko/tasks/summarization.md @@ -21,7 +21,7 @@ rendered properly in your Markdown viewer. 요약은 문서나 기사에서 중요한 정보를 모두 포함하되 짧게 만드는 일입니다. -번역과 마찬가지로, 시퀀스-투-시퀀스 문제로 구성할 수 있는 대표적인 작업 중 하나입니다. +번역과 마찬가지로, 시퀀스-투-시퀀스 문제로 구성할 수 있는 대표적인 작업 중 하나입니다. 요약에는 아래와 같이 유형이 있습니다: - 추출(Extractive) 요약: 문서에서 가장 관련성 높은 정보를 추출합니다. @@ -44,7 +44,7 @@ rendered properly in your Markdown viewer. pip install transformers datasets evaluate rouge_score ``` -Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 공유할 수 있습니다. +Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 공유할 수 있습니다. 토큰을 입력하여 로그인하세요. ```py @@ -114,14 +114,14 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 ... return model_inputs ``` -전체 데이터셋에 전처리 함수를 적용하려면 🤗 Datasets의 [`~datasets.Dataset.map`] 메소드를 사용하세요. +전체 데이터셋에 전처리 함수를 적용하려면 🤗 Datasets의 [`~datasets.Dataset.map`] 메소드를 사용하세요. `batched=True`로 설정하여 데이터셋의 여러 요소를 한 번에 처리하면 `map` 함수의 속도를 높일 수 있습니다. ```py >>> tokenized_billsum = billsum.map(preprocess_function, batched=True) ``` -이제 [`DataCollatorForSeq2Seq`]를 사용하여 예제 배치를 만드세요. +이제 [`DataCollatorForSeq2Seq`]를 사용하여 예제 배치를 만드세요. 전체 데이터셋을 최대 길이로 패딩하는 것보다 배치마다 가장 긴 문장 길이에 맞춰 *동적 패딩*하는 것이 더 효율적입니다. @@ -143,9 +143,9 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 ## 평가[[evaluate]] -학습 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. -🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하면 평가 방법을 빠르게 불러올 수 있습니다. -이 작업에서는 [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) 평가 지표를 가져옵니다. +학습 중에 평가 지표를 포함하면 모델의 성능을 평가하는 데 도움이 되는 경우가 많습니다. +🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) 라이브러리를 사용하면 평가 방법을 빠르게 불러올 수 있습니다. +이 작업에서는 [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) 평가 지표를 가져옵니다. (평가 지표를 불러오고 계산하는 방법은 🤗 Evaluate [둘러보기](https://huggingface.co/docs/evaluate/a_quick_tour)를 참조하세요.) ```py @@ -196,9 +196,9 @@ Hugging Face 계정에 로그인하면 모델을 업로드하고 커뮤니티에 이제 세 단계만 남았습니다: -1. [`Seq2SeqTrainingArguments`]에서 학습 하이퍼파라미터를 정의하세요. -유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다. -`push_to_hub=True`를 설정하여 이 모델을 Hub에 푸시할 수 있습니다(모델을 업로드하려면 Hugging Face에 로그인해야 합니다.) +1. [`Seq2SeqTrainingArguments`]에서 학습 하이퍼파라미터를 정의하세요. +유일한 필수 매개변수는 모델을 저장할 위치를 지정하는 `output_dir`입니다. +`push_to_hub=True`를 설정하여 이 모델을 Hub에 푸시할 수 있습니다(모델을 업로드하려면 Hugging Face에 로그인해야 합니다.) [`Trainer`]는 각 에폭이 끝날 때마다 ROUGE 지표를 평가하고 학습 체크포인트를 저장합니다. 2. 모델, 데이터셋, 토크나이저, 데이터 콜레이터 및 `compute_metrics` 함수와 함께 학습 인수를 [`Seq2SeqTrainer`]에 전달하세요. 3. [`~Trainer.train`]을 호출하여 모델을 파인튜닝하세요. @@ -285,7 +285,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습 >>> model.compile(optimizer=optimizer) ``` -학습을 시작하기 전에 설정해야 할 마지막 두 가지는 예측에서 ROUGE 점수를 계산하고 모델을 Hub에 푸시하는 방법을 제공하는 것입니다. +학습을 시작하기 전에 설정해야 할 마지막 두 가지는 예측에서 ROUGE 점수를 계산하고 모델을 Hub에 푸시하는 방법을 제공하는 것입니다. 두 작업 모두 [Keras callbacks](../main_classes/keras_callbacks)으로 수행할 수 있습니다. [`~transformers.KerasMetricCallback`]에 `compute_metrics` 함수를 전달하세요: @@ -313,7 +313,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습 >>> callbacks = [metric_callback, push_to_hub_callback] ``` -드디어 모델 학습을 시작할 준비가 되었습니다! +드디어 모델 학습을 시작할 준비가 되었습니다! 
학습 및 검증 데이터셋, 에폭 수 및 콜백과 함께 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method)을 호출하여 모델을 파인튜닝하세요. ```py @@ -326,7 +326,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습 -요약을 위해 모델을 파인튜닝하는 방법에 대한 더 자세한 예제를 보려면 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) +요약을 위해 모델을 파인튜닝하는 방법에 대한 더 자세한 예제를 보려면 [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb) 또는 [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)을 참고하세요. @@ -341,7 +341,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습 >>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes." ``` -추론을 위해 파인튜닝한 모델을 시험해 보는 가장 간단한 방법은 [`pipeline`]에서 사용하는 것입니다. +추론을 위해 파인튜닝한 모델을 시험해 보는 가장 간단한 방법은 [`pipeline`]에서 사용하는 것입니다. 모델을 사용하여 요약을 수행할 [`pipeline`]을 인스턴스화하고 텍스트를 전달하세요: ```py @@ -366,7 +366,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습 >>> inputs = tokenizer(text, return_tensors="pt").input_ids ``` -요약문을 생성하려면 [`~transformers.generation_utils.GenerationMixin.generate`] 메소드를 사용하세요. +요약문을 생성하려면 [`~generation.GenerationMixin.generate`] 메소드를 사용하세요. 텍스트 생성에 대한 다양한 전략과 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [텍스트 생성](../main_classes/text_generation) API를 참조하세요. ```py @@ -393,7 +393,7 @@ TensorFlow에서 모델을 파인튜닝하려면, 먼저 옵티마이저, 학습 >>> inputs = tokenizer(text, return_tensors="tf").input_ids ``` -요약문을 생성하려면 [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] 메소드를 사용하세요. +요약문을 생성하려면 [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] 메소드를 사용하세요. 텍스트 생성에 대한 다양한 전략과 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [텍스트 생성](../main_classes/text_generation) API를 참조하세요. ```py diff --git a/docs/source/ko/tasks/translation.md b/docs/source/ko/tasks/translation.md index b05ecf2d5a2cc9..88dbb405e13693 100644 --- a/docs/source/ko/tasks/translation.md +++ b/docs/source/ko/tasks/translation.md @@ -341,7 +341,7 @@ TensorFlow에서 모델을 파인튜닝하려면 우선 optimizer 함수, 학습 ```py >>> from transformers import pipeline -# Change `xx` to the language of the input and `yy` to the language of the desired output. +# Change `xx` to the language of the input and `yy` to the language of the desired output. # Examples: "en" for English, "fr" for French, "de" for German, "es" for Spanish, "zh" for Chinese, etc; translation_en_to_fr translates English to French # You can view all the lists of languages here - https://huggingface.co/languages >>> translator = pipeline("translation_xx_to_yy", model="my_awesome_opus_books_model") @@ -362,7 +362,7 @@ TensorFlow에서 모델을 파인튜닝하려면 우선 optimizer 함수, 학습 >>> inputs = tokenizer(text, return_tensors="pt").input_ids ``` -[`~transformers.generation_utils.GenerationMixin.generate`] 메서드로 번역을 생성하세요. 다양한 텍스트 생성 전략 및 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [Text Generation](../main_classes/text_generation) API를 살펴보시기 바랍니다. +[`~generation.GenerationMixin.generate`] 메서드로 번역을 생성하세요. 다양한 텍스트 생성 전략 및 생성을 제어하기 위한 매개변수에 대한 자세한 내용은 [Text Generation](../main_classes/text_generation) API를 살펴보시기 바랍니다. 
```py >>> from transformers import AutoModelForSeq2SeqLM diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py index 7dc309ade966af..1c91361ee13c75 100644 --- a/src/transformers/generation/configuration_utils.py +++ b/src/transformers/generation/configuration_utils.py @@ -218,8 +218,7 @@ class GenerationConfig(PushToHubMixin): Switch to sequential beam search and sequential topk for contrastive search to reduce peak memory. Used with beam search and contrastive search. - - > Parameters that define the output variables of `generate` + > Parameters that define the output variables of generate num_return_sequences(`int`, *optional*, defaults to 1): The number of independently computed returned sequences for each element in the batch. @@ -256,25 +255,21 @@ class GenerationConfig(PushToHubMixin): `batch_size`. Indicating a list enables different start ids for each element in the batch (e.g. multilingual models with different target languages in one batch) - - > Generation parameters exclusive to [assistant generation](https://arxiv.org/abs/2211.17192) + > Generation parameters exclusive to assistant generation num_assistant_tokens (`int`, *optional*, defaults to 5): Defines the number of _speculative tokens_ that shall be generated by the assistant model before being checked by the target model at each iteration. Higher values for `num_assistant_tokens` make the generation more _speculative_ : If the assistant model is performant larger speed-ups can be reached, if the assistant model requires lots of corrections, lower speed-ups are reached. - num_assistant_tokens_schedule (`str`, *optional*, defaults to `"heuristic"`): Defines the schedule at which max assistant tokens shall be changed during inference. - `"heuristic"`: When all speculative tokens are correct, increase `num_assistant_tokens` by 2 else reduce by 1. `num_assistant_tokens` value is persistent over multiple generation calls with the same assistant model. - `"heuristic_transient"`: Same as `"heuristic"` but `num_assistant_tokens` is reset to its initial value after each generation call. - `"constant"`: `num_assistant_tokens` stays unchanged during generation - prompt_lookup_num_tokens (`int`, *optional*, default to `None`): The number of tokens to be output as candidate tokens. - max_matching_ngram_size (`int`, *optional*, default to `None`): The maximum ngram size to be considered for matching in the prompt. Default to 2 if not provided. @@ -283,7 +278,6 @@ class GenerationConfig(PushToHubMixin): cache_implementation (`str`, *optional*, default to `None`): Cache class that should be used when generating. 
- > Wild card generation_kwargs: From 66abe1395157f8cb18830166625d886671eeb2fb Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 2 May 2024 15:20:04 +0100 Subject: [PATCH 3/5] Docs: add missing `StoppingCriteria` autodocs (#30617) * add missing docstrings to docs * Update src/transformers/generation/stopping_criteria.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- docs/source/en/internal/generation_utils.md | 6 ++++++ src/transformers/__init__.py | 4 ++++ src/transformers/generation/stopping_criteria.py | 2 +- src/transformers/utils/dummy_pt_objects.py | 14 ++++++++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md index e6872efe730867..58c1d4478b02a6 100644 --- a/docs/source/en/internal/generation_utils.md +++ b/docs/source/en/internal/generation_utils.md @@ -310,6 +310,12 @@ A [`StoppingCriteria`] can be used to change when to stop generation (other than [[autodoc]] MaxTimeCriteria - __call__ +[[autodoc]] StopStringCriteria + - __call__ + +[[autodoc]] EosTokenCriteria + - __call__ + ## Constraints A [`Constraint`] can be used to force the generation to include specific tokens or sequences in the output. Please note that this is exclusively available to our PyTorch implementations. diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 083c7f031ac6cc..53a087468e66a0 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1419,6 +1419,7 @@ "DisjunctiveConstraint", "EncoderNoRepeatNGramLogitsProcessor", "EncoderRepetitionPenaltyLogitsProcessor", + "EosTokenCriteria", "EpsilonLogitsWarper", "EtaLogitsWarper", "ExponentialDecayLengthPenalty", @@ -1444,6 +1445,7 @@ "SequenceBiasLogitsProcessor", "StoppingCriteria", "StoppingCriteriaList", + "StopStringCriteria", "SuppressTokensAtBeginLogitsProcessor", "SuppressTokensLogitsProcessor", "TemperatureLogitsWarper", @@ -6372,6 +6374,7 @@ DisjunctiveConstraint, EncoderNoRepeatNGramLogitsProcessor, EncoderRepetitionPenaltyLogitsProcessor, + EosTokenCriteria, EpsilonLogitsWarper, EtaLogitsWarper, ExponentialDecayLengthPenalty, @@ -6397,6 +6400,7 @@ SequenceBiasLogitsProcessor, StoppingCriteria, StoppingCriteriaList, + StopStringCriteria, SuppressTokensAtBeginLogitsProcessor, SuppressTokensLogitsProcessor, TemperatureLogitsWarper, diff --git a/src/transformers/generation/stopping_criteria.py b/src/transformers/generation/stopping_criteria.py index 44c040ca6a4855..48392400c4f41c 100644 --- a/src/transformers/generation/stopping_criteria.py +++ b/src/transformers/generation/stopping_criteria.py @@ -98,7 +98,7 @@ class MaxNewTokensCriteria(StoppingCriteria): def __init__(self, start_length: int, max_new_tokens: int): warnings.warn( - "The class `MaxNewTokensCriteria` is deprecated. " + "The class `MaxNewTokensCriteria` is deprecated and will be removed in v4.43. 
" f"Please use `MaxLengthCriteria(max_length={start_length + max_new_tokens})` " "with `max_length = start_length + max_new_tokens` instead.", FutureWarning, diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index f91bbbe4fc44c0..d5c64cc141d268 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -177,6 +177,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class EosTokenCriteria(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class EpsilonLogitsWarper(metaclass=DummyObject): _backends = ["torch"] @@ -352,6 +359,13 @@ def __init__(self, *args, **kwargs): requires_backends(self, ["torch"]) +class StopStringCriteria(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + class SuppressTokensAtBeginLogitsProcessor(metaclass=DummyObject): _backends = ["torch"] From 9719202d37492a43c323d03a81aed23d14d98dec Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Thu, 2 May 2024 15:24:33 +0100 Subject: [PATCH 4/5] Generate: fix `SinkCache` on Llama models (#30581) --- src/transformers/cache_utils.py | 27 ++++++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py index ceca9d3eeb3592..2e29e19ade46a4 100644 --- a/src/transformers/cache_utils.py +++ b/src/transformers/cache_utils.py @@ -207,7 +207,9 @@ def __init__(self, window_length: int, num_sink_tokens: int) -> None: self.value_cache: List[torch.Tensor] = [] self.window_length = window_length self.num_sink_tokens = num_sink_tokens - self.cos_sin_cache = {} + self.cos_sin_rerotation_cache = {} + self._cos_cache = None + self._sin_cache = None self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen @staticmethod @@ -225,7 +227,7 @@ def _apply_key_rotary_pos_emb( def _get_rerotation_cos_sin( self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor ) -> Tuple[torch.Tensor, torch.Tensor]: - if key_states.shape[-2] not in self.cos_sin_cache: + if key_states.shape[-2] not in self.cos_sin_rerotation_cache: # Upcast to float32 temporarily for better accuracy cos = cos.to(torch.float32) sin = sin.to(torch.float32) @@ -238,11 +240,11 @@ def _get_rerotation_cos_sin( rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin - self.cos_sin_cache[key_states.shape[-2]] = ( + self.cos_sin_rerotation_cache[key_states.shape[-2]] = ( rerotation_cos.to(key_states.dtype).unsqueeze(0), rerotation_sin.to(key_states.dtype).unsqueeze(0), ) - return self.cos_sin_cache[key_states.shape[-2]] + return self.cos_sin_rerotation_cache[key_states.shape[-2]] def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: """Returns the sequence length of the cached states. A layer index can be optionally passed.""" @@ -292,6 +294,21 @@ def update( if layer_idx == 0: self._seen_tokens += key_states.shape[-2] + # Update the sin/cos cache, which holds sin/cos values for all possible positions + if using_rope and layer_idx == 0: + # BC: some models still pass `sin`/`cos` with 2 dims. In those models, they are the full sin/cos. Remove + # after all RoPE models have a llama-like cache utilization. 
+ if cos.dim() == 2: + self._cos_cache = cos + self._sin_cache = sin + else: + if self._cos_cache is None: + self._cos_cache = cos[0, ...] + self._sin_cache = sin[0, ...] + elif self._cos_cache.shape[0] < self.window_length: + self._cos_cache = torch.cat([self._cos_cache, cos[0, ...]], dim=0) + self._sin_cache = torch.cat([self._sin_cache, sin[0, ...]], dim=0) + # [bsz, num_heads, seq_len, head_dim] if len(self.key_cache) <= layer_idx: # Empty cache @@ -312,7 +329,7 @@ def update( # On RoPE models, we need to recompute the Key rotation as the tokens are shifted if using_rope: rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin( - key_states, cos[: self.window_length], sin[: self.window_length] + key_states, self._cos_cache[: self.window_length], self._sin_cache[: self.window_length] ) if partial_rotation_size is not None: keys_to_keep, keys_pass = ( From 39359e5b5f6246c667cc25f743437cb05d492ea7 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Thu, 2 May 2024 17:03:10 +0200 Subject: [PATCH 5/5] Fix FX tracing issues for Llama (#30619) --- src/transformers/utils/fx.py | 41 +++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index 0faf7e0d6ea956..b19efac1306cc5 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -714,9 +714,14 @@ class HFCacheProxy(HFProxy): Proxy that represents an instance of `transformers.cache_utils.Cache`. """ + def install_orig_cache_cls(self, orig_cache_cls: Type[Cache]): + self._orig_cache_cls = orig_cache_cls + @property def __class__(self): - return ProxyableCache + if not hasattr(self, "_orig_cache_cls"): + raise RuntimeError("The original Cache class must be installed to the HFCacheProxy.") + return self.tracer._CLASSES_TO_PATCH[self._orig_cache_cls] def create_wrapper( @@ -806,23 +811,39 @@ def _proxies_to_metas(v): return v -def cache_proxy_factory_fn(n: Node) -> HFCacheProxy: - global _CURRENT_TRACER - if not isinstance(_CURRENT_TRACER, HFTracer): - raise RuntimeError("Cannot create HFCacheProxy because there is no HFTracer currently tracing.") - return HFCacheProxy(n, _CURRENT_TRACER) +def create_cache_proxy_factory_fn(orig_cache_cls: Type[Cache]) -> Callable[[Node], HFCacheProxy]: + def cache_proxy_factory_fn(n: Node) -> HFCacheProxy: + global _CURRENT_TRACER + if not isinstance(_CURRENT_TRACER, HFTracer): + raise RuntimeError("Cannot create HFCacheProxy because there is no HFTracer currently tracing.") + cache_proxy = HFCacheProxy(n, _CURRENT_TRACER) + cache_proxy.install_orig_cache_cls(orig_cache_cls) + return cache_proxy + + return cache_proxy_factory_fn # Proxyable equivalent of the cache classes defined in `transformers.cache_utils`. 
-ProxyableCache = HFProxyableClassMeta("ProxyableCache", (Cache,), {}, proxy_factory_fn=cache_proxy_factory_fn) +ProxyableCache = HFProxyableClassMeta( + "ProxyableCache", (Cache,), {}, proxy_factory_fn=create_cache_proxy_factory_fn(Cache) +) ProxyableDynamicCache = HFProxyableClassMeta( - "ProxyableDynamicCache", (DynamicCache,), {}, proxy_factory_fn=cache_proxy_factory_fn + "ProxyableDynamicCache", + (DynamicCache,), + {}, + proxy_factory_fn=create_cache_proxy_factory_fn(DynamicCache), ) ProxyableSinkCache = HFProxyableClassMeta( - "ProxyableSinkCache", (SinkCache,), {}, proxy_factory_fn=cache_proxy_factory_fn + "ProxyableSinkCache", + (SinkCache,), + {}, + proxy_factory_fn=create_cache_proxy_factory_fn(SinkCache), ) ProxyableStaticCache = HFProxyableClassMeta( - "ProxyableStaticCache", (StaticCache,), {}, proxy_factory_fn=cache_proxy_factory_fn + "ProxyableStaticCache", + (StaticCache,), + {}, + proxy_factory_fn=create_cache_proxy_factory_fn(StaticCache), )
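
Usage note for PATCH 4/5 and 5/5: the `SinkCache` whose sin/cos bookkeeping is fixed above is normally driven through `generate` by passing it as `past_key_values`. The sketch below is illustrative only, not part of these patches; the checkpoint name and the `window_length`/`num_sink_tokens` values are placeholder assumptions.

```py
# Illustrative sketch of exercising SinkCache through `generate`.
# The checkpoint and cache sizes are placeholder assumptions, not values from the patches.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.cache_utils import SinkCache

checkpoint = "meta-llama/Llama-2-7b-chat-hf"  # any Llama-style RoPE model
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.float16, device_map="auto")

inputs = tokenizer("Attention sinks let a model keep generating because", return_tensors="pt").to(model.device)

# SinkCache keeps `num_sink_tokens` initial tokens plus a sliding window of the most
# recent tokens; the per-layer `_cos_cache`/`_sin_cache` added in PATCH 4/5 is what
# allows key rerotation when the window slides on RoPE models such as Llama.
past_key_values = SinkCache(window_length=256, num_sink_tokens=4)

outputs = model.generate(**inputs, past_key_values=past_key_values, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

The same cache class is what PATCH 5/5 makes FX-traceable, by giving each concrete cache class (`Cache`, `DynamicCache`, `SinkCache`, `StaticCache`) its own proxy factory instead of one shared `cache_proxy_factory_fn`.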