diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml
index b267ad7882d89f..be070a95d3a94f 100644
--- a/.github/workflows/build-docker-images.yml
+++ b/.github/workflows/build-docker-images.yml
@@ -271,3 +271,39 @@ jobs:
             REF=main
           push: true
           tags: huggingface/transformers-tensorflow-gpu
+
+  # latest-pytorch-deepspeed-amd:
+  #   name: "PyTorch + DeepSpeed (AMD) [dev]"
+
+  #   runs-on: [self-hosted, docker-gpu, amd-gpu, single-gpu, mi210]
+  #   steps:
+  #     - name: Set up Docker Buildx
+  #       uses: docker/setup-buildx-action@v3
+  #     - name: Check out code
+  #       uses: actions/checkout@v3
+  #     - name: Login to DockerHub
+  #       uses: docker/login-action@v3
+  #       with:
+  #         username: ${{ secrets.DOCKERHUB_USERNAME }}
+  #         password: ${{ secrets.DOCKERHUB_PASSWORD }}
+  #     - name: Build and push
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }}
+  #     # Push CI images still need to be re-built daily
+  #     -
+  #       name: Build and push (for Push CI) in a daily basis
+  #       # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`.
+  #       # The later case is useful for manual image building for debugging purpose. Use another tag in this case!
+  #       if: inputs.image_postfix != '-push-ci'
+  #       uses: docker/build-push-action@v5
+  #       with:
+  #         context: ./docker/transformers-pytorch-deepspeed-amd-gpu
+  #         build-args: |
+  #           REF=main
+  #         push: true
+  #         tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci
diff --git a/.github/workflows/check_runner_status.yml b/.github/workflows/check_runner_status.yml
deleted file mode 100644
index 328d284223a857..00000000000000
--- a/.github/workflows/check_runner_status.yml
+++ /dev/null
@@ -1,68 +0,0 @@
-name: Self-hosted runner (check runner status)
-
-# Note that each job's dependencies go into a corresponding docker file.
-#
-# For example for `run_all_tests_torch_cuda_extensions_gpu` the docker image is
-# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at
-# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile`
-
-on:
-  repository_dispatch:
-  schedule:
-    # run per hour
-    - cron: "0 */1 * * *"
-
-env:
-  TRANSFORMERS_IS_CI: yes
-
-jobs:
-  check_runner_status:
-    name: Check Runner Status
-    runs-on: ubuntu-22.04
-    outputs:
-      offline_runners: ${{ steps.set-offline_runners.outputs.offline_runners }}
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker,single-gpu-scheduled-ci-runner-docker,multi-scheduled-scheduled-ci-runner-docker,single-gpu-doctest-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-
-      - id: set-offline_runners
-        name: Set output for offline runners
-        if: ${{ always() }}
-        run: |
-          offline_runners=$(python3 -c 'fp = open("offline_runners.txt"); failed = fp.read(); fp.close(); print(failed)')
-          echo "offline_runners=$offline_runners" >> $GITHUB_OUTPUT
-
-  send_results:
-    name: Send results to webhook
-    runs-on: ubuntu-22.04
-    needs: check_runner_status
-    if: ${{ failure() }}
-    steps:
-      - name: Preliminary job status
-        shell: bash
-        run: |
-          echo "Runner availability: ${{ needs.check_runner_status.result }}"
-
-      - uses: actions/checkout@v3
-      - uses: actions/download-artifact@v3
-      - name: Send message to Slack
-        env:
-          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
-          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
-          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
-          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
-          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
-          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-          CI_EVENT: runner status check
-          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
-          OFFLINE_RUNNERS: ${{ needs.check_runner_status.outputs.offline_runners }}
-        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
-        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
-        run: |
-          pip install slack_sdk
-          python utils/notification_service.py
diff --git a/.github/workflows/delete_doc_comment.yml b/.github/workflows/delete_doc_comment.yml
deleted file mode 100644
index 8604019d76eb50..00000000000000
--- a/.github/workflows/delete_doc_comment.yml
+++ /dev/null
@@ -1,14 +0,0 @@
-name: Delete doc comment
-
-on:
-  workflow_run:
-    workflows: ["Delete doc comment trigger"]
-    types:
-      - completed
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
-    secrets:
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
\ No newline at end of file
diff --git a/.github/workflows/delete_doc_comment_trigger.yml b/.github/workflows/delete_doc_comment_trigger.yml
deleted file mode 100644
index f87d9bd4dca705..00000000000000
--- a/.github/workflows/delete_doc_comment_trigger.yml
+++ /dev/null
@@ -1,12 +0,0 @@
-name: Delete doc comment trigger
-
-on:
-  pull_request:
-    types: [ closed ]
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
-    with:
-      pr_number: ${{ github.event.number }}
diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml
index e4b4f7f77cf077..37dc98f340a16d 100644
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -212,7 +212,7 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -286,4 +286,4 @@ jobs:
         with:
           name: |
               single-*
-              multi-*
\ No newline at end of file
+              multi-*
diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml
index 6a154544df8b97..ed60c92f6745a8 100644
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@@ -88,6 +88,10 @@ jobs:
         working-directory: /transformers
         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
 
+      - name: Update some packages
+        working-directory: /transformers
+        run: python3 -m pip install -U datasets
+
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
@@ -164,6 +168,10 @@ jobs:
         working-directory: /transformers
         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
 
+      - name: Update some packages
+        working-directory: /transformers
+        run: python3 -m pip install -U datasets
+
       - name: Echo folder ${{ matrix.folders }}
         shell: bash
         # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
@@ -240,6 +248,10 @@ jobs:
         working-directory: /transformers
         run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
 
+      - name: Update some packages
+        working-directory: /transformers
+        run: python3 -m pip install -U datasets
+
       - name: Install
         working-directory: /transformers
         run: |
@@ -255,7 +267,7 @@ jobs:
           python3 -m pip uninstall -y deepspeed
           rm -rf DeepSpeed
           git clone https://github.com/microsoft/DeepSpeed && cd DeepSpeed && rm -rf build
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -341,4 +353,4 @@ jobs:
         with:
           name: |
               single-*
-              multi-*
\ No newline at end of file
+              multi-*
diff --git a/.github/workflows/self-push-amd-mi210-caller.yml b/.github/workflows/self-push-amd-mi210-caller.yml
index 5dd010ef66d8fb..2e6b9301d8d72a 100644
--- a/.github/workflows/self-push-amd-mi210-caller.yml
+++ b/.github/workflows/self-push-amd-mi210-caller.yml
@@ -18,7 +18,7 @@ on:
 jobs:
   run_amd_ci:
     name: AMD mi210
-    if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    if: (cancelled() != true) && ((github.event_name == 'push') && (github.ref_name == 'main' || startsWith(github.ref_name, 'run_amd_push_ci_caller')))
     uses: ./.github/workflows/self-push-amd.yml
     with:
       gpu_flavor: mi210
diff --git a/.github/workflows/self-push-amd-mi250-caller.yml b/.github/workflows/self-push-amd-mi250-caller.yml
index a55378c4caa54b..412fb8f08870e3 100644
--- a/.github/workflows/self-push-amd-mi250-caller.yml
+++ b/.github/workflows/self-push-amd-mi250-caller.yml
@@ -18,7 +18,7 @@ on:
 jobs:
   run_amd_ci:
     name: AMD mi250
-    if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
+    if: (cancelled() != true) && ((github.event_name == 'push') && (github.ref_name == 'main' || startsWith(github.ref_name, 'run_amd_push_ci_caller')))
     uses: ./.github/workflows/self-push-amd.yml
     with:
       gpu_flavor: mi250
diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml
index c72f224a300cc8..19857981b12dbd 100644
--- a/.github/workflows/self-push-amd.yml
+++ b/.github/workflows/self-push-amd.yml
@@ -38,14 +38,16 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
         run: |
           rocminfo  | grep "Agent" -A 14
-      - name: Show HIP environment
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
   setup_gpu:
@@ -57,7 +59,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     outputs:
       matrix: ${{ steps.set-matrix.outputs.matrix }}
       test_map: ${{ steps.set-matrix.outputs.test_map }}
@@ -155,7 +157,7 @@ jobs:
     runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
     container:
       image: huggingface/transformers-pytorch-amd-gpu-push-ci  # <--- We test only for PyTorch for now
-      options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
     steps:
       # Necessary to get the correct branch name and commit SHA for `workflow_run` event
       # We also take into account the `push` event (we might want to test some changes in a branch)
@@ -206,11 +208,13 @@ jobs:
           echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
 
       - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
         run: |
           rocminfo  | grep "Agent" -A 14
-      - name: Show HIP environment
+      - name: Show ROCR environment
         run: |
-          echo "HIP: $HIP_VISIBLE_DEVICES"
           echo "ROCR: $ROCR_VISIBLE_DEVICES"
 
       - name: Environment
diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml
index a6ea5b1e04b942..e6f1f3b3050f7a 100644
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -366,7 +366,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -456,7 +456,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
diff --git a/.github/workflows/self-scheduled-amd-caller.yml b/.github/workflows/self-scheduled-amd-caller.yml
new file mode 100644
index 00000000000000..dc5c7b7e905bd8
--- /dev/null
+++ b/.github/workflows/self-scheduled-amd-caller.yml
@@ -0,0 +1,14 @@
+name: Self-hosted runner (AMD scheduled CI caller)
+
+on:
+  schedule:
+    - cron: "17 2 * * *"
+
+jobs:
+  run_scheduled_amd_ci:
+    name: Trigger Scheduled AMD CI
+    runs-on: ubuntu-22.04
+    if: ${{ always() }}
+    steps:
+      - name: Trigger scheduled AMD CI via workflow_run
+        run: echo "Trigger scheduled AMD CI via workflow_run"
diff --git a/.github/workflows/self-scheduled-amd-mi210-caller.yml b/.github/workflows/self-scheduled-amd-mi210-caller.yml
new file mode 100644
index 00000000000000..cdb968901058b6
--- /dev/null
+++ b/.github/workflows/self-scheduled-amd-mi210-caller.yml
@@ -0,0 +1,19 @@
+name: Self-hosted runner (AMD mi210 scheduled CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  run_amd_ci:
+    name: AMD mi210
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      gpu_flavor: mi210
+    secrets: inherit
diff --git a/.github/workflows/self-scheduled-amd-mi250-caller.yml b/.github/workflows/self-scheduled-amd-mi250-caller.yml
new file mode 100644
index 00000000000000..dc7d12f173935e
--- /dev/null
+++ b/.github/workflows/self-scheduled-amd-mi250-caller.yml
@@ -0,0 +1,19 @@
+name: Self-hosted runner (AMD mi250 scheduled CI caller)
+
+on:
+  workflow_run:
+    workflows: ["Self-hosted runner (AMD scheduled CI caller)"]
+    branches: ["main"]
+    types: [completed]
+  push:
+    branches:
+      - run_amd_scheduled_ci_caller*
+
+jobs:
+  run_amd_ci:
+    name: AMD mi250
+    if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_scheduled_ci_caller')))
+    uses: ./.github/workflows/self-scheduled-amd.yml
+    with:
+      gpu_flavor: mi250
+    secrets: inherit
diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml
new file mode 100644
index 00000000000000..3d41a3b95e6c50
--- /dev/null
+++ b/.github/workflows/self-scheduled-amd.yml
@@ -0,0 +1,518 @@
+name: Self-hosted runner (scheduled-amd)
+
+# Note: For the AMD CI, we rely on a caller workflow and on the workflow_call event to trigger the
+# CI in order to run it on both MI210 and MI250, without having to use matrix here which pushes
+# us towards the limit of allowed jobs on GitHub Actions.
+on:
+  workflow_call:
+    inputs:
+      gpu_flavor:
+        required: true
+        type: string
+
+env:
+  HF_HOME: /mnt/cache
+  TRANSFORMERS_IS_CI: yes
+  OMP_NUM_THREADS: 8
+  MKL_NUM_THREADS: 8
+  RUN_SLOW: yes
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+
+
+# Important note: each job (run_tests_single_gpu, run_tests_multi_gpu, run_examples_gpu, run_pipelines_torch_gpu) requires all the previous jobs before running.
+# This is done so that we avoid parallelizing the scheduled tests, to leave available
+# runners for the push CI that is running on the same machine.
+jobs:
+  check_runner_status:
+    name: Check Runner Status
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Check Runner Status
+        run: python utils/check_self_hosted_runner.py --target_runners hf-amd-mi210-ci-1gpu-1,hf-amd-mi250-ci-1gpu-1 --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+
+  check_runners:
+    name: Check Runners
+    needs: check_runner_status
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+  setup:
+    name: Setup
+    needs: check_runners
+    strategy:
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: |
+          git fetch && git checkout ${{ github.sha }}
+
+      - name: Cleanup
+        working-directory: /transformers
+        run: |
+          rm -rf tests/__pycache__
+          rm -rf tests/models/__pycache__
+          rm -rf reports
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - id: set-matrix
+        name: Identify models to test
+        working-directory: /transformers/tests
+        run: |
+          echo "matrix=$(python3 -c 'import os; tests = os.getcwd(); model_tests = os.listdir(os.path.join(tests, "models")); d1 = sorted(list(filter(os.path.isdir, os.listdir(tests)))); d2 = sorted(list(filter(os.path.isdir, [f"models/{x}" for x in model_tests]))); d1.remove("models"); d = d2 + d1; print(d)')" >> $GITHUB_OUTPUT
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+  run_tests_single_gpu:
+    name: Single GPU tests
+    strategy:
+      max-parallel: 1  # For now, not to parallelize. Can change later if it works well.
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_tests_multi_gpu:
+    name: Multi GPU tests
+    strategy:
+      max-parallel: 1
+      fail-fast: false
+      matrix:
+        folders: ${{ fromJson(needs.setup.outputs.matrix) }}
+        machine_type: [multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Echo folder ${{ matrix.folders }}
+        shell: bash
+        # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to
+        # set the artifact folder names (because the character `/` is not allowed).
+        run: |
+          echo "${{ matrix.folders }}"
+          matrix_folders=${{ matrix.folders }}
+          matrix_folders=${matrix_folders/'models/'/'models_'}
+          echo "$matrix_folders"
+          echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV
+
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} tests/${{ matrix.folders }}
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }}
+
+  run_examples_gpu:
+    name: Examples tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run examples tests on GPU
+        working-directory: /transformers
+        run: |
+          pip install -r examples/pytorch/_tests_requirements.txt
+          python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_examples_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu
+
+  run_pipelines_torch_gpu:
+    name: PyTorch pipelines tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    container:
+      image: huggingface/transformers-pytorch-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    needs: setup
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all pipeline tests on GPU
+        working-directory: /transformers
+        run: |
+          python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ matrix.machine_type }}_tests_torch_pipeline_gpu tests/pipelines
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu
+
+  run_tests_torch_deepspeed_gpu:
+    name: Torch ROCm deepspeed tests
+    strategy:
+      fail-fast: false
+      matrix:
+        machine_type: [single-gpu, multi-gpu]
+
+    runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
+    needs: setup
+    container:
+      image: huggingface/transformers-pytorch-deepspeed-amd-gpu
+      options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Update clone
+        working-directory: /transformers
+        run: git fetch && git checkout ${{ github.sha }}
+
+      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
+        working-directory: /transformers
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .
+
+      - name: ROCM-SMI
+        run: |
+          rocm-smi
+      - name: ROCM-INFO
+        run: |
+          rocminfo  | grep "Agent" -A 14
+
+      - name: Show ROCR environment
+        run: |
+          echo "ROCR: $ROCR_VISIBLE_DEVICES"
+
+      - name: Environment
+        working-directory: /transformers
+        run: |
+          python3 utils/print_env.py
+
+      - name: Show installed libraries and their versions
+        working-directory: /transformers
+        run: pip freeze
+
+      - name: Run all tests on GPU
+        working-directory: /transformers
+        run: python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_torch_deepspeed_gpu tests/deepspeed tests/extended
+
+      - name: Failure short reports
+        if: ${{ failure() }}
+        continue-on-error: true
+        run: cat /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu/failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports
+          path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu
+
+  run_extract_warnings:
+    name: Extract warnings in CI artifacts
+    runs-on: ubuntu-22.04
+    if: always()
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_torch_gpu,
+      run_tests_torch_deepspeed_gpu
+    ]
+    steps:
+      - name: Checkout transformers
+        uses: actions/checkout@v3
+        with:
+          fetch-depth: 2
+
+      - name: Install transformers
+        run: pip install transformers
+
+      - name: Show installed libraries and their versions
+        run: pip freeze
+
+      - name: Create output directory
+        run: mkdir warnings_in_ci
+
+      - uses: actions/download-artifact@v3
+        with:
+          path: warnings_in_ci
+
+      - name: Show artifacts
+        run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')"
+        working-directory: warnings_in_ci
+
+      - name: Extract warnings in CI artifacts
+        run: |
+          python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh
+          echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')"
+
+      - name: Upload artifact
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: warnings_in_ci
+          path: warnings_in_ci/selected_warnings.json
+
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-22.04
+    if: always()
+    needs: [
+      check_runner_status,
+      check_runners,
+      setup,
+      run_tests_single_gpu,
+      run_tests_multi_gpu,
+      run_examples_gpu,
+      run_pipelines_torch_gpu,
+      run_tests_torch_deepspeed_gpu,
+      run_extract_warnings
+    ]
+    steps:
+      - name: Preliminary job status
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          echo "Runner availability: ${{ needs.check_runner_status.result }}"
+          echo "Runner status: ${{ needs.check_runners.result }}"
+          echo "Setup status: ${{ needs.setup.result }}"
+
+      - uses: actions/checkout@v3
+      - uses: actions/download-artifact@v3
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID_DAILY_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_AMD }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_AMD }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          CI_EVENT: Scheduled CI (AMD) - ${{ inputs.gpu_flavor }}
+          CI_SHA: ${{ github.sha }}
+          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
+          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
+          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
+          SETUP_STATUS: ${{ needs.setup.result }}
+        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
+        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
+        run: |
+          sudo apt-get install -y curl
+          pip install slack_sdk
+          pip show slack_sdk
+          python utils/notification_service.py "${{ needs.setup.outputs.matrix }}"
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: test_failure_tables
+          path: test_failure_tables
diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml
index 4a04cb14ac7bb3..995df2e07880ac 100644
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -366,7 +366,7 @@ jobs:
         working-directory: /workspace
         run: |
           python3 -m pip uninstall -y deepspeed
-          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
+          DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check
 
       - name: NVIDIA-SMI
         run: |
@@ -494,5 +494,5 @@ jobs:
         if: ${{ always() }}
         uses: actions/upload-artifact@v3
         with:
-          name: test_failure_tables
-          path: test_failure_tables
+          name: prev_ci_results
+          path: prev_ci_results
diff --git a/ISSUES.md b/ISSUES.md
index 95f2334b26c803..a5969a3027f86d 100644
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -152,7 +152,7 @@ You are not required to read the following guidelines before opening an issue. H
 
    ```bash
     cd examples/seq2seq
-    python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
+    torchrun --nproc_per_node=2 ./finetune_trainer.py \
     --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
     --output_dir output_dir --overwrite_output_dir \
     --do_train --n_train 500 --num_train_epochs 1 \
diff --git a/README.md b/README.md
index 12724e60a1881d..4598868474b4c3 100644
--- a/README.md
+++ b/README.md
@@ -397,12 +397,14 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
@@ -415,6 +417,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
@@ -439,6 +442,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from  IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
@@ -464,6 +469,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -489,14 +495,17 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim.
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
diff --git a/README_es.md b/README_es.md
index 5cdbc27ec7918d..52a35cfb96a948 100644
--- a/README_es.md
+++ b/README_es.md
@@ -372,12 +372,14 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom..
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
@@ -390,6 +392,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
@@ -414,6 +417,8 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from  IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
@@ -439,6 +444,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -464,14 +470,17 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim. 
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
diff --git a/README_hd.md b/README_hd.md
index 01937532f967c4..c19b944c609189 100644
--- a/README_hd.md
+++ b/README_hd.md
@@ -346,12 +346,14 @@ conda install -c huggingface transformers
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (दक्षिण चीन प्रौद्योगिकी विश्वविद्यालय से) साथ में कागज [LiLT: एक सरल लेकिन प्रभावी भाषा-स्वतंत्र लेआउट ट्रांसफार्मर संरचित दस्तावेज़ समझ के लिए](https://arxiv.org/abs/2202.13669) जियापेंग वांग, लियानवेन जिन, काई डिंग द्वारा पोस्ट किया गया।
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI से) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. द्वाराअनुसंधान पत्र [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) के साथ जारी किया गया
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI से) Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.. द्वाराअनुसंधान पत्र [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) के साथ जारी किया गया
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (Microsoft Research & University of Wisconsin-Madison से) Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee. द्वाराअनुसंधान पत्र [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) के साथ जारी किया गया
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (मैंडी गुओ, जोशुआ आइंस्ली, डेविड यूथस, सैंटियागो ओंटानन, जियानमो नि, यूं-हुआन सुंग, यिनफेई यांग द्वारा पोस्ट किया गया।
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (स्टूडियो औसिया से) साथ में पेपर [LUKE: डीप कॉन्टेक्स्टुअलाइज्ड एंटिटी रिप्रेजेंटेशन विद एंटिटी-अवेयर सेल्फ-अटेंशन](https ://arxiv.org/abs/2010.01057) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto द्वारा।
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC चैपल हिल से) साथ में पेपर [LXMERT: ओपन-डोमेन क्वेश्चन के लिए ट्रांसफॉर्मर से क्रॉस-मोडलिटी एनकोडर रिप्रेजेंटेशन सीखना Answering](https://arxiv.org/abs/1908.07490) हाओ टैन और मोहित बंसल द्वारा।
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (फेसबुक से) साथ देने वाला पेपर [बियॉन्ड इंग्लिश-सेंट्रिक मल्टीलिंगुअल मशीन ट्रांसलेशन](https://arxiv.org/ एब्स/2010.11125) एंजेला फैन, श्रुति भोसले, होल्गर श्वेन्क, झी मा, अहमद अल-किश्की, सिद्धार्थ गोयल, मनदीप बैनेस, ओनूर सेलेबी, गुइल्लाम वेन्जेक, विश्रव चौधरी, नमन गोयल, टॉम बर्च, विटाली लिपचिंस्की, सर्गेई एडुनोव, एडौर्ड द्वारा ग्रेव, माइकल औली, आर्मंड जौलिन द्वारा पोस्ट किया गया।
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg द्वारा [OPUS](http://opus.nlpl.eu/) डेटा से प्रशिक्षित मशीनी अनुवाद मॉडल पोस्ट किया गया टाइडेमैन द्वारा। [मैरियन फ्रेमवर्क](https://marian-nmt.github.io/) माइक्रोसॉफ्ट ट्रांसलेटर टीम द्वारा विकसित।
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (माइक्रोसॉफ्ट रिसर्च एशिया से) साथ में पेपर [मार्कअपएलएम: विजुअली-रिच डॉक्यूमेंट अंडरस्टैंडिंग के लिए टेक्स्ट और मार्कअप लैंग्वेज का प्री-ट्रेनिंग] (https://arxiv.org/abs/2110.08518) जुनलॉन्ग ली, यिहेंग जू, लेई कुई, फुरु द्वारा वी द्वारा पोस्ट किया गया।
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC से) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. द्वाराअनुसंधान पत्र [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) के साथ जारी किया गया
@@ -364,6 +366,7 @@ conda install -c huggingface transformers
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA से) साथ वाला पेपर [Megatron-LM: ट्रेनिंग मल्टी-बिलियन पैरामीटर लैंग्वेज मॉडल्स यूजिंग मॉडल पैरेललिज़्म] (https://arxiv.org/abs/1909.08053) मोहम्मद शोएबी, मोस्टोफा पटवारी, राउल पुरी, पैट्रिक लेग्रेस्ले, जेरेड कैस्पर और ब्रायन कैटानज़ारो द्वारा पोस्ट किया गया।
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research से) Peng Wang, Cheng Da, and Cong Yao. द्वाराअनुसंधान पत्र [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) के साथ जारी किया गया
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (फ्रॉम Studio Ousia) साथ में पेपर [mLUKE: द पावर ऑफ एंटिटी रिप्रेजेंटेशन इन मल्टीलिंगुअल प्रीट्रेन्ड लैंग्वेज मॉडल्स](https://arxiv.org/abs/2110.08151) रयोकन री, इकुया यामाडा, और योशिमासा त्सुरोका द्वारा।
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook से) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. द्वाराअनुसंधान पत्र [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) के साथ जारी किया गया
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [मोबाइलबर्ट: संसाधन-सीमित उपकरणों के लिए एक कॉम्पैक्ट टास्क-अज्ञेय बीईआरटी] (https://arxiv.org/abs/2004.02984) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, और Denny Zhou द्वारा पोस्ट किया गया।
@@ -388,6 +391,8 @@ conda install -c huggingface transformers
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया।
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI से) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. द्वाराअनुसंधान पत्र [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) के साथ जारी किया गया
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** ( IBM Research से) Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) के साथ जारी किया गया
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM से) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. द्वाराअनुसंधान पत्र [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) के साथ जारी किया गया
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा।
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया।
@@ -413,6 +418,7 @@ conda install -c huggingface transformers
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित।
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा।
@@ -438,14 +444,17 @@ conda install -c huggingface transformers
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU की ओर से) कागज के साथ [संस्करण-एक्स: एक ब्लॉग मॉडल चौकस चौक मॉडल मॉडल] (https://arxivorg/abs/1901.02860) क्वोकोक वी. ले, रुस्लैन सलाखुतदी
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research से) Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. द्वाराअनुसंधान पत्र [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) के साथ जारी किया गया
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (माइक्रोसॉफ्ट रिसर्च से) साथ में दिया गया पेपर [UniSpeech: यूनिफाइड स्पीच रिप्रेजेंटेशन लर्निंग विद लेबलेड एंड अनलेबल्ड डेटा](https:/ /arxiv.org/abs/2101.07597) चेंगई वांग, यू वू, याओ कियान, केनिची कुमातानी, शुजी लियू, फुरु वेई, माइकल ज़ेंग, ज़ुएदोंग हुआंग द्वारा।
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [UNISPEECH-SAT: यूनिवर्सल स्पीच रिप्रेजेंटेशन लर्निंग विद स्पीकर अवेयर प्री-ट्रेनिंग ](https://arxiv.org/abs/2110.05752) सानयुआन चेन, यू वू, चेंग्यी वांग, झेंगयांग चेन, झूओ चेन, शुजी लियू, जियान वू, याओ कियान, फुरु वेई, जिन्यु ली, जियांगज़ान यू द्वारा पोस्ट किया गया।
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim. 
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (सिंघुआ यूनिवर्सिटी और ननकाई यूनिवर्सिटी से) साथ में पेपर [विजुअल अटेंशन नेटवर्क](https://arxiv.org/ pdf/2202.09741.pdf) मेंग-हाओ गुओ, चेंग-ज़े लू, झेंग-निंग लियू, मिंग-मिंग चेंग, शि-मिन हू द्वारा।
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (मल्टीमीडिया कम्प्यूटिंग ग्रुप, नानजिंग यूनिवर्सिटी से) साथ में पेपर [वीडियोएमएई: मास्क्ड ऑटोएन्कोडर स्व-पर्यवेक्षित वीडियो प्री-ट्रेनिंग के लिए डेटा-कुशल सीखने वाले हैं] (https://arxiv.org/abs/2203.12602) ज़ान टोंग, यिबिंग सॉन्ग, जुए द्वारा वांग, लिमिन वांग द्वारा पोस्ट किया गया।
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain से) साथ में कागज [ViLT: Vision-and-Language Transformer बिना कनवल्शन या रीजन सुपरविजन](https://arxiv.org/abs/2102.03334) वोनजे किम, बोक्यूंग सोन, इल्डू किम द्वारा पोस्ट किया गया।
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (University of Wisconsin–Madison से) Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee. द्वाराअनुसंधान पत्र [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) के साथ जारी किया गया
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (गूगल एआई से) कागज के साथ [एक इमेज इज़ वर्थ 16x16 वर्ड्स: ट्रांसफॉर्मर्स फॉर इमेज रिकॉग्निशन एट स्केल](https://arxiv.org/abs/2010.11929) एलेक्सी डोसोवित्स्की, लुकास बेयर, अलेक्जेंडर कोलेसनिकोव, डिर्क वीसेनबोर्न, शियाओहुआ झाई, थॉमस अनटरथिनर, मुस्तफा देहघानी, मैथियास मिंडरर, जॉर्ज हेगोल्ड, सिल्वेन गेली, जैकब उस्ज़कोरेइट द्वारा हॉल्सबी द्वारा पोस्ट किया गया।
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP से) साथ वाला पेपर [VisualBERT: A Simple and Performant Baseline for Vision and Language](https:/ /arxiv.org/pdf/1908.03557) लियुनियन हेरोल्ड ली, मार्क यात्स्कर, दा यिन, चो-जुई हसीह, काई-वेई चांग द्वारा।
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
diff --git a/README_ja.md b/README_ja.md
index 5935da396bf165..f54d1c54b5a73d 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -406,12 +406,14 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology から) Jiapeng Wang, Lianwen Jin, Kai Ding から公開された研究論文: [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669)
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI から) Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample. から公開された研究論文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI から) Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.. から公開された研究論文 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX)
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (Microsoft Research & University of Wisconsin-Madison から) Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee. から公開された研究論文 [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485)
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI から) Iz Beltagy, Matthew E. Peters, Arman Cohan から公開された研究論文: [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150)
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI から) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang から公開された研究論文: [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916)
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia から) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto から公開された研究論文: [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057)
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC Chapel Hill から) Hao Tan and Mohit Bansal から公開された研究論文: [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490)
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (Facebook から) Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert から公開された研究論文: [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161)
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook から) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin から公開された研究論文: [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125)
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Jörg Tiedemann から. [OPUS](http://opus.nlpl.eu/) を使いながら学習された "Machine translation" (マシントランスレーション) モデル. [Marian Framework](https://marian-nmt.github.io/) はMicrosoft Translator Team　が現在開発中です.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia から) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei から公開された研究論文: [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518)
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC から) Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar. から公開された研究論文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)
@@ -424,6 +426,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA から) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro から公開された研究論文: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053)
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research から) Peng Wang, Cheng Da, and Cong Yao. から公開された研究論文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia から) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka から公開された研究論文: [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151)
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook から) Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli. から公開された研究論文 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain から) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou から公開された研究論文: [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984)
@@ -448,6 +451,8 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068)
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. から公開された研究論文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** ( IBM Research から) Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf)
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM から) Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam. から公開された研究論文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347)
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795)
@@ -473,6 +478,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM)
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
@@ -498,14 +504,17 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU から) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov から公開された研究論文: [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860)
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft から), Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei から公開された研究論文: [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282)
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill から), Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal から公開された研究論文: [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156)
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (Intel から), Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding から公開された研究論文: [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995)
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research から) Yi Tay, Mostafa Dehghani, Vinh Q から公開された研究論文: [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research から) Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant. から公開された研究論文 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi)
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research から) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang から公開された研究論文: [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597)
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research から) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu から公開された研究論文: [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752)
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim. 
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University から) Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun. から公開された研究論文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University から) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu から公開された研究論文: [Visual Attention Network](https://arxiv.org/abs/2202.09741)
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University から) Zhan Tong, Yibing Song, Jue Wang, Limin Wang から公開された研究論文: [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602)
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain から) Wonjae Kim, Bokyung Son, Ildoo Kim から公開された研究論文: [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334)
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (University of Wisconsin–Madison から) Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee. から公開された研究論文 [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784)
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP から) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang から公開された研究論文: [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557)
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI から) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby から公開された研究論文: [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929)
diff --git a/README_ko.md b/README_ko.md
index e0c38472cc4606..a039331b93d085 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -321,12 +321,14 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (South China University of Technology 에서) Jiapeng Wang, Lianwen Jin, Kai Ding 의 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 논문과 함께 발표했습니다.
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.의 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971)논문과 함께 발표했습니다.
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (The FAIR team of Meta AI 에서 제공)은 Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom..의 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX)논문과 함께 발표했습니다.
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (Microsoft Research & University of Wisconsin-Madison 에서 제공)은 Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.의 [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485)논문과 함께 발표했습니다.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (AllenAI 에서) Iz Beltagy, Matthew E. Peters, Arman Cohan 의 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 논문과 함께 발표했습니다.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (Google AI 에서) Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 의 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 논문과 함께 발표했습니다.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (Studio Ousia 에서) Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 의 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 논문과 함께 발표했습니다.
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (UNC Chapel Hill 에서) Hao Tan and Mohit Bansal 의 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 논문과 함께 발표했습니다.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (Facebook 에서) Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 의 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 논문과 함께 발표했습니다.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (Facebook 에서) Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 의 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 논문과 함께 발표했습니다.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (Microsoft Research Asia 에서) Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 의 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 논문과 함께 발표했습니다.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (FAIR and UIUC 에서 제공)은 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.의 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527)논문과 함께 발표했습니다.
@@ -339,6 +341,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (NVIDIA 에서) Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 의 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 논문과 함께 발표했습니다.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (Alibaba Research 에서 제공)은 Peng Wang, Cheng Da, and Cong Yao.의 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592)논문과 함께 발표했습니다.
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (Studio Ousia 에서) Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 의 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 논문과 함께 발표했습니다.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (Facebook 에서 제공)은 Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.의 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516)논문과 함께 발표했습니다.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (CMU/Google Brain 에서) Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 의 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 논문과 함께 발표했습니다.
@@ -363,6 +366,8 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다.
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI 에서 제공)은 Matthias Minderer, Alexey Gritsenko, Neil Houlsby.의 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)논문과 함께 발표했습니다.
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** ( IBM Research 에서 제공)은 Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf)논문과 함께 발표했습니다.
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (IBM 에서 제공)은 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.의 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf)논문과 함께 발표했습니다.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다.
@@ -388,6 +393,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다.
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
@@ -413,14 +419,17 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (Google/CMU 에서) Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 의 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 논문과 함께 발표했습니다.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (Microsoft 에서) Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 의 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 논문과 함께 발표했습니다.
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill 에서) Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 의 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 논문과 함께 발표했습니다.
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (Intel 에서) Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding 의 [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) 논문과 함께 발표했습니다.
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (Google Research 에서) Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzle 의 [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) 논문과 함께 발표했습니다.
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (Google Research 에서 제공)은 Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.의 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi)논문과 함께 발표했습니다.
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (Microsoft Research 에서) Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 의 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 논문과 함께 발표했습니다.
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (Microsoft Research 에서) Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 의 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 논문과 함께 발표했습니다.
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim. 
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (Peking University 에서 제공)은 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.의 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221)논문과 함께 발표했습니다.
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (Tsinghua University and Nankai University 에서) Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 의 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 논문과 함께 발표했습니다.
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (Multimedia Computing Group, Nanjing University 에서) Zhan Tong, Yibing Song, Jue Wang, Limin Wang 의 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 논문과 함께 발표했습니다.
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (NAVER AI Lab/Kakao Enterprise/Kakao Brain 에서) Wonjae Kim, Bokyung Son, Ildoo Kim 의 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 논문과 함께 발표했습니다.
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (University of Wisconsin–Madison 에서 제공)은 Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.의 [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784)논문과 함께 발표했습니다.
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다.
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (UCLA NLP 에서) Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 의 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 논문과 함께 발표했습니다.
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (Google AI 에서) Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 의 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 논문과 함께 발표했습니다.
diff --git a/README_pt-br.md b/README_pt-br.md
index 0e5f638bc5f6cc..0c8e8d09e3591a 100644
--- a/README_pt-br.md
+++ b/README_pt-br.md
@@ -409,6 +409,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
diff --git a/README_ru.md b/README_ru.md
index cf63f7c67ef9ec..bc32f2f0b44cf1 100644
--- a/README_ru.md
+++ b/README_ru.md
@@ -399,6 +399,7 @@ conda install -c huggingface transformers
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
diff --git a/README_te.md b/README_te.md
index 1b6a1812fa0444..829e27690f9683 100644
--- a/README_te.md
+++ b/README_te.md
@@ -402,6 +402,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 3d84374d5561d5..ef22939374c95a 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -345,12 +345,14 @@ conda install -c huggingface transformers
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (来自 South China University of Technology) 伴随论文 [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) 由 Jiapeng Wang, Lianwen Jin, Kai Ding 发布。
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (来自 The FAIR team of Meta AI) 伴随论文 [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) 由 Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample 发布。
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (来自 The FAIR team of Meta AI) 伴随论文 [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) 由 Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom. 发布。
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (来自 Microsoft Research & University of Wisconsin-Madison) 伴随论文 [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) 由 Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee 发布。
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (来自 Google AI) released 伴随论文 [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) 由 Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang 发布。
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (来自 Facebook) 伴随论文 [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) 由 Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert 发布。
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (来自 Microsoft Research Asia) 伴随论文 [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) 由 Junlong Li, Yiheng Xu, Lei Cui, Furu Wei 发布。
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (来自 FAIR and UIUC) 伴随论文 [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) 由 Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar 发布。
@@ -363,6 +365,7 @@ conda install -c huggingface transformers
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (来自 Alibaba Research) 伴随论文 [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) 由 Peng Wang, Cheng Da, and Cong Yao 发布。
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (来自 Studio Ousia) 伴随论文 [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) 由 Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka 发布。
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (来自 Facebook) 伴随论文 [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) 由 Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli 发布。
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (来自 CMU/Google Brain) 伴随论文 [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) 由 Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou 发布。
@@ -387,6 +390,8 @@ conda install -c huggingface transformers
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (来自 Google AI) 伴随论文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) 由 Matthias Minderer, Alexey Gritsenko, Neil Houlsby 发布。
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (来自  IBM Research) 伴随论文 [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) 由 Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (来自 IBM) 伴随论文 [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) 由 Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam 发布。
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。
@@ -412,6 +417,7 @@ conda install -c huggingface transformers
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
@@ -437,14 +443,17 @@ conda install -c huggingface transformers
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (来自 UNC Chapel Hill) 伴随论文 [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) 由 Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal 发布。
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (来自 Intel) 伴随论文 [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) 由 Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding 发布.
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (来自 Google Research) 伴随论文 [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) 由 Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant 发布。
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim. 
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (来自 Peking University) 伴随论文 [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) 由 Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun 发布。
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (来自 Tsinghua University and Nankai University) 伴随论文 [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) 由 Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu 发布。
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (来自 Multimedia Computing Group, Nanjing University) 伴随论文 [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) 由 Zhan Tong, Yibing Song, Jue Wang, Limin Wang 发布。
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (来自 NAVER AI Lab/Kakao Enterprise/Kakao Brain) 伴随论文 [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) 由 Wonjae Kim, Bokyung Son, Ildoo Kim 发布。
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (来自 University of Wisconsin–Madison) 伴随论文 [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) 由 Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee 发布。
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index c095423cce15dd..53fa729020797c 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -357,12 +357,14 @@ conda install -c huggingface transformers
 1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
 1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
 1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/XXX) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom..
+1. **[LLaVa](https://huggingface.co/docs/transformers/main/model_doc/llava)** (from Microsoft Research & University of Wisconsin-Madison) released with the paper [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
 1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
 1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
 1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MADLAD-400](https://huggingface.co/docs/transformers/model_doc/madlad-400)** (from Google) released with the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](https://arxiv.org/abs/2309.04662) by Sneha Kudugunta, Isaac Caswell, Biao Zhang, Xavier Garcia, Christopher A. Choquette-Choo, Katherine Lee, Derrick Xin, Aditya Kusupati, Romi Stella, Ankur Bapna, Orhan Firat.
 1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
 1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
@@ -375,6 +377,7 @@ conda install -c huggingface transformers
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The Mistral AI team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.. 
+1. **[Mixtral](https://huggingface.co/docs/transformers/main/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. 
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
@@ -399,6 +402,8 @@ conda install -c huggingface transformers
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[PatchTSMixer](https://huggingface.co/docs/transformers/main/model_doc/patchtsmixer)** (from  IBM Research) released with the paper [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
+1. **[PatchTST](https://huggingface.co/docs/transformers/main/model_doc/patchtst)** (from IBM) released with the paper [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/pdf/2211.14730.pdf) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong, Jayant Kalagnanam.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
@@ -424,6 +429,7 @@ conda install -c huggingface transformers
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
 1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SeamlessM4Tv2](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t_v2)** (from Meta AI) released with the paper [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@@ -449,14 +455,17 @@ conda install -c huggingface transformers
 1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[TVP](https://huggingface.co/docs/transformers/model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
 1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
 1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[UnivNet](https://huggingface.co/docs/transformers/main/model_doc/univnet)** (from Kakao Corporation) released with the paper [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kim, and Juntae Kim. 
 1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
 1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/pdf/2202.09741.pdf) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
 1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
 1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
+1. **[VipLlava](https://huggingface.co/docs/transformers/main/model_doc/vipllava)** (from University of Wisconsin–Madison) released with the paper [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile
index d108ba5ace5805..7ab236a55d5902 100644
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@@ -56,7 +56,7 @@ RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://hu
 RUN python3 -m pip install --no-cache-dir einops
 
 # Add autoawq for quantization testing
-RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.6/autoawq-0.1.6+cu118-cp38-cp38-linux_x86_64.whl
+RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.1.7/autoawq-0.1.7+cu118-cp38-cp38-linux_x86_64.whl
 
 # For bettertransformer + gptq 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
diff --git a/docker/transformers-pytorch-amd-gpu/Dockerfile b/docker/transformers-pytorch-amd-gpu/Dockerfile
index f19cd4edb0e4f8..46ca1a531b4ab4 100644
--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@@ -1,27 +1,32 @@
-FROM rocm/pytorch:rocm5.6_ubuntu20.04_py3.8_pytorch_2.0.1
+FROM rocm/dev-ubuntu-20.04:5.6
+# rocm/pytorch has no version with 2.1.0
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
 
+ARG PYTORCH='2.1.0'
+ARG TORCH_VISION='0.16.0'
+ARG TORCH_AUDIO='2.1.0'
+ARG ROCM='5.6'
+
 RUN apt update && \
-    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg && \
+    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg && \
     apt clean && \
     rm -rf /var/lib/apt/lists/*
 
-RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
+RUN python3 -m pip install --no-cache-dir --upgrade pip
 
-# If set to nothing, will install the latest version
-ARG PYTORCH='2.0.1'
-ARG TORCH_VISION='0.15.2'
-ARG TORCH_AUDIO='2.0.2'
-ARG ROCM='5.6'
+RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM
 
-RUN git clone --depth 1 --branch v$TORCH_AUDIO https://github.com/pytorch/audio.git
-RUN cd audio && USE_ROCM=1 USE_CUDA=0 python setup.py install
+RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
 
 ARG REF=main
 WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
 RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
 
 RUN python3 -m pip uninstall -y tensorflow flax
diff --git a/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
new file mode 100644
index 00000000000000..1fa384dfa2bc03
--- /dev/null
+++ b/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile
@@ -0,0 +1,45 @@
+FROM rocm/dev-ubuntu-22.04:5.6
+LABEL maintainer="Hugging Face"
+
+ARG DEBIAN_FRONTEND=noninteractive
+ARG PYTORCH='2.1.1'
+ARG TORCH_VISION='0.16.1'
+ARG TORCH_AUDIO='2.1.1'
+ARG ROCM='5.6'
+
+RUN apt update && \
+    apt install -y --no-install-recommends \
+    libaio-dev \
+    git \
+    # These are required to build deepspeed.
+    python3-dev \
+    python-is-python3 \
+    rocrand-dev \
+    rocthrust-dev \
+    hipsparse-dev \
+    hipblas-dev \
+    rocblas-dev && \
+    apt clean && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic<2"
+RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
+RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
+
+# Pre-build DeepSpeed, so it's be ready for testing (to avoid timeout)
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
+
+ARG REF=main
+WORKDIR /
+
+# Invalidate docker cache from here if new commit is available.
+ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
+RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
+
+RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]
+
+# When installing in editable mode, `transformers` is not recognized as a package.
+# this line must be added in order for python to be aware of transformers.
+RUN cd transformers && python3 setup.py develop
+
+RUN python3 -c "from deepspeed.launcher.runner import main"
\ No newline at end of file
diff --git a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
index 276f35f3351846..a7b08a8c60d31d 100644
--- a/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile
@@ -1,12 +1,12 @@
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
+FROM nvcr.io/nvidia/pytorch:23.11-py3
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
 
 ARG PYTORCH='2.1.0'
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu118'
+ARG CUDA='cu121'
 
 RUN apt -y update
 RUN apt install -y libaio-dev
@@ -34,7 +34,7 @@ RUN python3 -m pip uninstall -y torch-tensorrt
 
 # recompile apex
 RUN python3 -m pip uninstall -y apex
-RUN git clone https://github.com/NVIDIA/apex
+# RUN git clone https://github.com/NVIDIA/apex
 #  `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
 # TODO: check if there is alternative way to install latest apex
 # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
@@ -44,7 +44,7 @@ RUN python3 -m pip uninstall -y deepspeed
 # This has to be run (again) inside the GPU VMs running the tests.
 # The installation works here, but some tests fail, if we don't pre-build deepspeed again in the VMs running the tests.
 # TODO: Find out why test fail.
-RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
+RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
 
 # When installing in editable mode, `transformers` is not recognized as a package.
 # this line must be added in order for python to be aware of transformers.
diff --git a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
index b3ead0c615471f..06da67049296a5 100644
--- a/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
+++ b/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile
@@ -1,11 +1,11 @@
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-22-12.html#rel-22-12
-FROM nvcr.io/nvidia/pytorch:22.12-py3
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
+FROM nvcr.io/nvidia/pytorch:23.11-py3
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
 
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu118'
+ARG CUDA='cu121'
 
 RUN apt -y update
 RUN apt install -y libaio-dev
diff --git a/docker/transformers-pytorch-gpu/Dockerfile b/docker/transformers-pytorch-gpu/Dockerfile
index 702a837abd0192..44f609589419f2 100644
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
+FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
 LABEL maintainer="Hugging Face"
 
 ARG DEBIAN_FRONTEND=noninteractive
@@ -15,7 +15,7 @@ ARG PYTORCH='2.1.0'
 ARG TORCH_VISION=''
 ARG TORCH_AUDIO=''
 # Example: `cu102`, `cu113`, etc.
-ARG CUDA='cu118'
+ARG CUDA='cu121'
 
 RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' ||  VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
 RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='TORCH_VISION'.*' ||  VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
diff --git a/docs/source/de/preprocessing.md b/docs/source/de/preprocessing.md
index 1e8f6ff4062aea..9c977e10a538a3 100644
--- a/docs/source/de/preprocessing.md
+++ b/docs/source/de/preprocessing.md
@@ -209,7 +209,7 @@ Audioeingaben werden anders vorverarbeitet als Texteingaben, aber das Endziel bl
 pip install datasets
 ```
 
-Laden Sie den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz (weitere Informationen zum Laden eines Datensatzes finden Sie im 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html)):
+Laden Sie den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz (weitere Informationen zum Laden eines Datensatzes finden Sie im 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub)):
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -344,7 +344,7 @@ Laden wir den [food101](https://huggingface.co/datasets/food101) Datensatz für
 >>> dataset = load_dataset("food101", split="train[:100]")
 ```
 
-Als Nächstes sehen Sie sich das Bild mit dem Merkmal 🤗 Datensätze [Bild] (https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) an:
+Als Nächstes sehen Sie sich das Bild mit dem Merkmal 🤗 Datensätze [Bild] (https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image) an:
 
 ```py
 >>> dataset[0]["image"]
@@ -385,7 +385,7 @@ Bei Bildverarbeitungsaufgaben ist es üblich, den Bildern als Teil der Vorverarb
 ...     return examples
 ```
 
-3. Dann verwenden Sie 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform), um die Transformationen im laufenden Betrieb anzuwenden:
+3. Dann verwenden Sie 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process#format-transform), um die Transformationen im laufenden Betrieb anzuwenden:
 
 ```py
 >>> dataset.set_transform(transforms)
diff --git a/docs/source/de/quicktour.md b/docs/source/de/quicktour.md
index 139869e5d1eeb3..2b66d2d6a917e9 100644
--- a/docs/source/de/quicktour.md
+++ b/docs/source/de/quicktour.md
@@ -121,7 +121,7 @@ Erstellen wir eine [`pipeline`] mit der Aufgabe die wir lösen und dem Modell we
 >>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 ```
 
-Als nächstes laden wir den Datensatz (siehe 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) für mehr Details) welches wir nutzen möchten. Zum Beispiel laden wir den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz:
+Als nächstes laden wir den Datensatz (siehe 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart) für mehr Details) welches wir nutzen möchten. Zum Beispiel laden wir den [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) Datensatz:
 
 ```py
 >>> from datasets import load_dataset, Audio
diff --git a/docs/source/de/run_scripts.md b/docs/source/de/run_scripts.md
index 2902d4c084144e..4afe72dae6d662 100644
--- a/docs/source/de/run_scripts.md
+++ b/docs/source/de/run_scripts.md
@@ -130,7 +130,7 @@ Der [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) unt
 - Legen Sie die Anzahl der zu verwendenden GPUs mit dem Argument `nproc_per_node` fest.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/de/training.md b/docs/source/de/training.md
index 493de3052bbf19..b1b7c14f261a72 100644
--- a/docs/source/de/training.md
+++ b/docs/source/de/training.md
@@ -43,7 +43,7 @@ Laden Sie zunächst den Datensatz [Yelp Reviews](https://huggingface.co/datasets
  'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
 ```
 
-Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten und eine Auffüll- und Abschneidungsstrategie einzubauen, um mit variablen Sequenzlängen umzugehen. Um Ihren Datensatz in einem Schritt zu verarbeiten, verwenden Sie die 🤗 Methode Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map), um eine Vorverarbeitungsfunktion auf den gesamten Datensatz anzuwenden:
+Wie Sie nun wissen, benötigen Sie einen Tokenizer, um den Text zu verarbeiten und eine Auffüll- und Abschneidungsstrategie einzubauen, um mit variablen Sequenzlängen umzugehen. Um Ihren Datensatz in einem Schritt zu verarbeiten, verwenden Sie die 🤗 Methode Datasets [`map`](https://huggingface.co/docs/datasets/process#map), um eine Vorverarbeitungsfunktion auf den gesamten Datensatz anzuwenden:
 
 ```py
 >>> from transformers import AutoTokenizer
diff --git a/docs/source/en/_redirects.yml b/docs/source/en/_redirects.yml
index 0dd4d2bfb34b2f..b6575a6b02f205 100644
--- a/docs/source/en/_redirects.yml
+++ b/docs/source/en/_redirects.yml
@@ -1,3 +1,3 @@
 # Optimizing inference
 
-perf_infer_gpu_many: perf_infer_gpu_one
\ No newline at end of file
+perf_infer_gpu_many: perf_infer_gpu_one
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 4e0ce88c10af31..09210a471e3acd 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -60,7 +60,7 @@
     - local: tasks/image_classification
       title: Image classification
     - local: tasks/semantic_segmentation
-      title: Semantic segmentation
+      title: Image segmentation
     - local: tasks/video_classification
       title: Video classification
     - local: tasks/object_detection
@@ -133,6 +133,8 @@
 - sections:
   - local: performance
     title: Overview
+  - local: quantization
+    title: Quantization
   - sections:
     - local: perf_train_gpu_one
       title: Methods and tools for efficient training on a single GPU
@@ -216,6 +218,8 @@
       title: Agents and Tools
     - local: model_doc/auto
       title: Auto Classes
+    - local: main_classes/backbones
+      title: Backbones
     - local: main_classes/callback
       title: Callbacks
     - local: main_classes/configuration
@@ -378,6 +382,8 @@
         title: LUKE
       - local: model_doc/m2m_100
         title: M2M100
+      - local: model_doc/madlad-400
+        title: MADLAD-400
       - local: model_doc/marian
         title: MarianMT
       - local: model_doc/markuplm
@@ -392,6 +398,8 @@
         title: MegatronGPT2
       - local: model_doc/mistral
         title: Mistral
+      - local: model_doc/mixtral
+        title: Mixtral
       - local: model_doc/mluke
         title: mLUKE
       - local: model_doc/mobilebert
@@ -614,6 +622,8 @@
         title: Pop2Piano
       - local: model_doc/seamless_m4t
         title: Seamless-M4T
+      - local: model_doc/seamless_m4t_v2
+        title: SeamlessM4T-v2
       - local: model_doc/sew
         title: SEW
       - local: model_doc/sew-d
@@ -628,6 +638,8 @@
         title: UniSpeech
       - local: model_doc/unispeech-sat
         title: UniSpeech-SAT
+      - local: model_doc/univnet
+        title: UnivNet
       - local: model_doc/vits
         title: VITS
       - local: model_doc/wav2vec2
@@ -695,6 +707,8 @@
         title: LayoutXLM
       - local: model_doc/lilt
         title: LiLT
+      - local: model_doc/llava
+        title: Llava
       - local: model_doc/lxmert
         title: LXMERT
       - local: model_doc/matcha
@@ -723,8 +737,12 @@
         title: TrOCR
       - local: model_doc/tvlt
         title: TVLT
+      - local: model_doc/tvp
+        title: TVP
       - local: model_doc/vilt
         title: ViLT
+      - local: model_doc/vipllava
+        title: VipLlava
       - local: model_doc/vision-encoder-decoder
         title: Vision Encoder Decoder Models
       - local: model_doc/vision-text-dual-encoder
@@ -747,6 +765,10 @@
         title: Autoformer
       - local: model_doc/informer
         title: Informer
+      - local: model_doc/patchtsmixer
+        title: PatchTSMixer
+      - local: model_doc/patchtst
+        title: PatchTST
       - local: model_doc/time_series_transformer
         title: Time Series Transformer
       title: Time series models
diff --git a/docs/source/en/autoclass_tutorial.md b/docs/source/en/autoclass_tutorial.md
index 833882aa713e3d..876f9d897afd47 100644
--- a/docs/source/en/autoclass_tutorial.md
+++ b/docs/source/en/autoclass_tutorial.md
@@ -31,6 +31,7 @@ In this tutorial, learn to:
 * Load a pretrained feature extractor.
 * Load a pretrained processor.
 * Load a pretrained model.
+* Load a model as a backbone.
 
 ## AutoTokenizer
 
@@ -95,7 +96,7 @@ Load a processor with [`AutoProcessor.from_pretrained`]:
 
 <frameworkcontent>
 <pt>
-Finally, the `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:
+The `AutoModelFor` classes let you load a pretrained model for a given task (see [here](model_doc/auto) for a complete list of available tasks). For example, load a model for sequence classification with [`AutoModelForSequenceClassification.from_pretrained`]:
 
 ```py
 >>> from transformers import AutoModelForSequenceClassification
@@ -141,3 +142,24 @@ Easily reuse the same checkpoint to load an architecture for a different task:
 Generally, we recommend using the `AutoTokenizer` class and the `TFAutoModelFor` class to load pretrained instances of models. This will ensure you load the correct architecture every time. In the next [tutorial](preprocessing), learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning.
 </tf>
 </frameworkcontent>
+
+## AutoBackbone
+
+`AutoBackbone` lets you use pretrained models as backbones and get feature maps as outputs from different stages of the models. Below you can see how to get feature maps from a [Swin](model_doc/swin) checkpoint.
+
+```py
+>>> from transformers import AutoImageProcessor, AutoBackbone
+>>> import torch
+>>> from PIL import Image
+>>> import requests
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+>>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+>>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(0,))
+
+>>> inputs = processor(image, return_tensors="pt")
+>>> outputs = model(**inputs)
+>>> feature_maps = outputs.feature_maps
+>>> list(feature_maps[-1].shape)
+[1, 96, 56, 56]
+```
diff --git a/docs/source/en/chat_templating.md b/docs/source/en/chat_templating.md
index 82bdf591ae5f3c..a478c32e6ff393 100644
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@@ -376,7 +376,10 @@ input formats. Our default template for models that don't have a class-specific
 ```
 
 If you like this one, here it is in one-liner form, ready to copy into your code. The one-liner also includes
-handy support for "generation prompts" - see the next section for more!
+handy support for [generation prompts](#what-are-generation-prompts), but note that it doesn't add BOS or EOS tokens!
+If your model expects those, they won't be added automatically by `apply_chat_template` - in other words, the
+text will be tokenized with `add_special_tokens=False`. This is to avoid potential conflicts between the template and
+the `add_special_tokens` logic. If your model expects special tokens, make sure to add them to the template!
 
 ```
 tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
diff --git a/docs/source/en/custom_models.md b/docs/source/en/custom_models.md
index 4abc3ce5773a1f..22ba58b9d9ddc4 100644
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@@ -14,7 +14,7 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Sharing custom models
+# Building custom models
 
 The 🤗 Transformers library is designed to be easily extensible. Every model is fully coded in a given subfolder
 of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs.
@@ -22,7 +22,8 @@ of the repository with no abstraction, so you can easily copy a modeling file an
 If you are writing a brand new model, it might be easier to start from scratch. In this tutorial, we will show you
 how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it
 with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗
-Transformers library.
+Transformers library. We'll see how to build upon transformers and extend the framework with your hooks and
+custom code.
 
 We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the
 [timm library](https://github.com/rwightman/pytorch-image-models) into a [`PreTrainedModel`].
@@ -218,6 +219,27 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 Now let's see how to make sure that when we do [`~PreTrainedModel.save_pretrained`] or [`~PreTrainedModel.push_to_hub`], the
 code of the model is saved.
 
+## Registering a model with custom code to the auto classes
+
+If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own
+model. This is different from pushing the code to the Hub in the sense that users will need to import your library to
+get the custom models (contrarily to automatically downloading the model code from the Hub).
+
+As long as your config has a `model_type` attribute that is different from existing model types, and that your model
+classes have the right `config_class` attributes, you can just add them to the auto classes like this:
+
+```py
+from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
+
+AutoConfig.register("resnet", ResnetConfig)
+AutoModel.register(ResnetConfig, ResnetModel)
+AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
+```
+
+Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type`
+of your custom config, and the first argument used when registering your custom models to any auto model class needs
+to match the `config_class` of those models.
+
 ## Sending the code to the Hub
 
 <Tip warning={true}>
@@ -350,23 +372,3 @@ model = AutoModelForImageClassification.from_pretrained(
 Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit
 hash of any commit.
 
-## Registering a model with custom code to the auto classes
-
-If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own
-model. This is different from pushing the code to the Hub in the sense that users will need to import your library to
-get the custom models (contrarily to automatically downloading the model code from the Hub).
-
-As long as your config has a `model_type` attribute that is different from existing model types, and that your model
-classes have the right `config_class` attributes, you can just add them to the auto classes like this:
-
-```py
-from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
-
-AutoConfig.register("resnet", ResnetConfig)
-AutoModel.register(ResnetConfig, ResnetModel)
-AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
-```
-
-Note that the first argument used when registering your custom config to [`AutoConfig`] needs to match the `model_type`
-of your custom config, and the first argument used when registering your custom models to any auto model class needs
-to match the `config_class` of those models.
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index ae01569e970ce3..f63922d7f854a0 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -1,4 +1,4 @@
-<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+        <!--Copyright 2020 The HuggingFace Team. All rights reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -94,7 +94,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                       [CLIPSeg](model_doc/clipseg)                       |       ✅        |         ❌         |      ❌      |
 |                          [CLVP](model_doc/clvp)                          |       ✅        |         ❌         |      ❌      |
 |                       [CodeGen](model_doc/codegen)                       |       ✅        |         ❌         |      ❌      |
-|                    [CodeLlama](model_doc/code_llama)                     |       ✅        |         ❌         |      ❌      |
+|                    [CodeLlama](model_doc/code_llama)                     |       ✅        |         ❌         |      ✅      |
 |              [Conditional DETR](model_doc/conditional_detr)              |       ✅        |         ❌         |      ❌      |
 |                      [ConvBERT](model_doc/convbert)                      |       ✅        |         ✅         |      ❌      |
 |                      [ConvNeXT](model_doc/convnext)                      |       ✅        |         ✅         |      ❌      |
@@ -167,14 +167,16 @@ Flax), PyTorch, and/or TensorFlow.
 |                           [LED](model_doc/led)                           |       ✅        |         ✅         |      ❌      |
 |                         [LeViT](model_doc/levit)                         |       ✅        |         ❌         |      ❌      |
 |                          [LiLT](model_doc/lilt)                          |       ✅        |         ❌         |      ❌      |
-|                         [LLaMA](model_doc/llama)                         |       ✅        |         ❌         |      ❌      |
-|                        [Llama2](model_doc/llama2)                        |       ✅        |         ❌         |      ❌      |
+|                         [LLaMA](model_doc/llama)                         |       ✅        |         ❌         |      ✅      |
+|                        [Llama2](model_doc/llama2)                        |       ✅        |         ❌         |      ✅      |
+|                         [LLaVa](model_doc/llava)                         |       ✅        |         ❌         |      ❌      |
 |                    [Longformer](model_doc/longformer)                    |       ✅        |         ✅         |      ❌      |
 |                        [LongT5](model_doc/longt5)                        |       ✅        |         ❌         |      ✅      |
 |                          [LUKE](model_doc/luke)                          |       ✅        |         ❌         |      ❌      |
 |                        [LXMERT](model_doc/lxmert)                        |       ✅        |         ✅         |      ❌      |
 |                        [M-CTC-T](model_doc/mctct)                        |       ✅        |         ❌         |      ❌      |
 |                       [M2M100](model_doc/m2m_100)                        |       ✅        |         ❌         |      ❌      |
+|                    [MADLAD-400](model_doc/madlad-400)                    |       ✅        |         ✅         |      ✅      |
 |                        [Marian](model_doc/marian)                        |       ✅        |         ✅         |      ✅      |
 |                      [MarkupLM](model_doc/markuplm)                      |       ✅        |         ❌         |      ❌      |
 |                   [Mask2Former](model_doc/mask2former)                   |       ✅        |         ❌         |      ❌      |
@@ -187,6 +189,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                 [Megatron-GPT2](model_doc/megatron_gpt2)                 |       ✅        |         ✅         |      ✅      |
 |                       [MGP-STR](model_doc/mgp-str)                       |       ✅        |         ❌         |      ❌      |
 |                       [Mistral](model_doc/mistral)                       |       ✅        |         ❌         |      ❌      |
+|                       [Mixtral](model_doc/mixtral)                       |       ✅        |         ❌         |      ❌      |
 |                         [mLUKE](model_doc/mluke)                         |       ✅        |         ❌         |      ❌      |
 |                           [MMS](model_doc/mms)                           |       ✅        |         ✅         |      ✅      |
 |                    [MobileBERT](model_doc/mobilebert)                    |       ✅        |         ✅         |      ❌      |
@@ -213,6 +216,8 @@ Flax), PyTorch, and/or TensorFlow.
 |                           [OPT](model_doc/opt)                           |       ✅        |         ✅         |      ✅      |
 |                       [OWL-ViT](model_doc/owlvit)                        |       ✅        |         ❌         |      ❌      |
 |                         [OWLv2](model_doc/owlv2)                         |       ✅        |         ❌         |      ❌      |
+|                  [PatchTSMixer](model_doc/patchtsmixer)                  |       ✅        |         ❌         |      ❌      |
+|                      [PatchTST](model_doc/patchtst)                      |       ✅        |         ❌         |      ❌      |
 |                       [Pegasus](model_doc/pegasus)                       |       ✅        |         ✅         |      ✅      |
 |                     [PEGASUS-X](model_doc/pegasus_x)                     |       ✅        |         ❌         |      ❌      |
 |                     [Perceiver](model_doc/perceiver)                     |       ✅        |         ❌         |      ❌      |
@@ -240,6 +245,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                          [RWKV](model_doc/rwkv)                          |       ✅        |         ❌         |      ❌      |
 |                           [SAM](model_doc/sam)                           |       ✅        |         ✅         |      ❌      |
 |                  [SeamlessM4T](model_doc/seamless_m4t)                   |       ✅        |         ❌         |      ❌      |
+|                [SeamlessM4Tv2](model_doc/seamless_m4t_v2)                |       ✅        |         ❌         |      ❌      |
 |                     [SegFormer](model_doc/segformer)                     |       ✅        |         ✅         |      ❌      |
 |                           [SEW](model_doc/sew)                           |       ✅        |         ❌         |      ❌      |
 |                         [SEW-D](model_doc/sew-d)                         |       ✅        |         ❌         |      ❌      |
@@ -264,14 +270,17 @@ Flax), PyTorch, and/or TensorFlow.
 |                  [Transformer-XL](model_doc/transfo-xl)                  |       ✅        |         ✅         |      ❌      |
 |                         [TrOCR](model_doc/trocr)                         |       ✅        |         ❌         |      ❌      |
 |                          [TVLT](model_doc/tvlt)                          |       ✅        |         ❌         |      ❌      |
+|                           [TVP](model_doc/tvp)                           |       ✅        |         ❌         |      ❌      |
 |                           [UL2](model_doc/ul2)                           |       ✅        |         ✅         |      ✅      |
 |                          [UMT5](model_doc/umt5)                          |       ✅        |         ❌         |      ❌      |
 |                     [UniSpeech](model_doc/unispeech)                     |       ✅        |         ❌         |      ❌      |
 |                 [UniSpeechSat](model_doc/unispeech-sat)                  |       ✅        |         ❌         |      ❌      |
+|                       [UnivNet](model_doc/univnet)                       |       ✅        |         ❌         |      ❌      |
 |                       [UPerNet](model_doc/upernet)                       |       ✅        |         ❌         |      ❌      |
 |                           [VAN](model_doc/van)                           |       ✅        |         ❌         |      ❌      |
 |                      [VideoMAE](model_doc/videomae)                      |       ✅        |         ❌         |      ❌      |
 |                          [ViLT](model_doc/vilt)                          |       ✅        |         ❌         |      ❌      |
+|                      [VipLlava](model_doc/vipllava)                      |       ✅        |         ❌         |      ❌      |
 |        [Vision Encoder decoder](model_doc/vision-encoder-decoder)        |       ✅        |         ✅         |      ✅      |
 |       [VisionTextDualEncoder](model_doc/vision-text-dual-encoder)        |       ✅        |         ✅         |      ✅      |
 |                   [VisualBERT](model_doc/visual_bert)                    |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/en/internal/generation_utils.md b/docs/source/en/internal/generation_utils.md
index 906ee4ea620b2a..40969f227e9dbe 100644
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@@ -368,3 +368,20 @@ A [`Constraint`] can be used to force the generation to include specific tokens
 [[autodoc]] TextStreamer
 
 [[autodoc]] TextIteratorStreamer
+
+## Caches
+
+[[autodoc]] Cache
+    - update
+
+[[autodoc]] DynamicCache
+    - update
+    - get_seq_length
+    - reorder_cache
+    - to_legacy_cache
+    - from_legacy_cache
+
+[[autodoc]] SinkCache
+    - update
+    - get_seq_length
+    - reorder_cache
diff --git a/docs/source/en/internal/trainer_utils.md b/docs/source/en/internal/trainer_utils.md
index e3f8a9b04536fa..1bc5e2baae2d6f 100644
--- a/docs/source/en/internal/trainer_utils.md
+++ b/docs/source/en/internal/trainer_utils.md
@@ -40,7 +40,7 @@ Most of those are only useful if you are studying the code of the Trainer in the
 
 [[autodoc]] trainer_pt_utils.DistributedTensorGatherer
 
-## Distributed Evaluation
+## Trainer Argument Parser
 
 [[autodoc]] HfArgumentParser
 
diff --git a/docs/source/en/llm_tutorial.md b/docs/source/en/llm_tutorial.md
index bb42fb4633bf69..8d6372e129cc47 100644
--- a/docs/source/en/llm_tutorial.md
+++ b/docs/source/en/llm_tutorial.md
@@ -250,7 +250,7 @@ While the autoregressive generation process is relatively straightforward, makin
 1. [Guide](generation_strategies) on how to control different generation methods, how to set up the generation configuration file, and how to stream the output;
 2. [Guide](chat_templating) on the prompt template for chat LLMs;
 3. [Guide](tasks/prompting) on to get the most of prompt design;
-4. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils).
+4. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils). Most of the classes, including the logits processors, have usage examples!
 
 ### LLM leaderboards
 
diff --git a/docs/source/en/llm_tutorial_optimization.md b/docs/source/en/llm_tutorial_optimization.md
index 497e624820d4ed..93848d72b0d811 100644
--- a/docs/source/en/llm_tutorial_optimization.md
+++ b/docs/source/en/llm_tutorial_optimization.md
@@ -22,7 +22,7 @@ The crux of these challenges lies in augmenting the computational and memory cap
 
 In this guide, we will go over the effective techniques for efficient LLM deployment:
 
-1.  **Lower Precision**: Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.
+1.  **Lower Precision:** Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md) can achieve computational advantages without a considerable decline in model performance.
 
 2.  **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.
 
@@ -58,7 +58,7 @@ As of writing this document, the largest GPU chip on the market is the A100 & H1
 🤗 Transformers does not support tensor parallelism out of the box as it requires the model architecture to be written in a specific way. If you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).
 
 Naive pipeline parallelism is supported out of the box. For this, simply load the model with `device="auto"` which will automatically place the different layers on the available GPUs as explained [here](https://huggingface.co/docs/accelerate/v0.22.0/en/concept_guides/big_model_inference).
-Note, however that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. For this more advanced pipeline parallelism is required as explained [here](https://huggingface.co/docs/transformers/v4.34.0/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
+Note, however that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. For this more advanced pipeline parallelism is required as explained [here](https://huggingface.co/docs/transformers/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
 
 If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows
 
@@ -286,7 +286,7 @@ If GPU memory is not a constraint for your use case, there is often no need to l
 For more in-detail usage information, we strongly recommend taking a look at the [Transformers Quantization Docs](https://huggingface.co/docs/transformers/main_classes/quantization#general-usage).
 Next, let's look into how we can improve computational and memory efficiency by using better algorithms and an improved model architecture.
 
-# 2. Flash Attention
+## 2. Flash Attention
 
 Today's top-performing LLMs share more or less the same fundamental architecture that consists of feed-forward layers, activation layers, layer normalization layers, and most crucially, self-attention layers.
 
@@ -441,7 +441,7 @@ flush()
 ```
 
 For comparison, let's run the same function, but enable Flash Attention instead.
-To do so, we convert the model to [BetterTransformers](https://huggingface.co/docs/optimum/bettertransformer/overview) and by doing so enabling PyTorch's [SDPA self-attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) which in turn is based on Flash Attention.
+To do so, we convert the model to [BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) and by doing so enabling PyTorch's [SDPA self-attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) which in turn is able to use Flash Attention.
 
 ```python
 model.to_bettertransformer()
@@ -484,7 +484,9 @@ We can observe that we only use roughly 100MB more GPU memory when passing a ver
 ```py
 flush()
 ```
-For more information on how to use Flash Attention, please have a look at [this doc page](https://huggingface.co/docs/transformers/v4.34.0/en/perf_infer_gpu_one#flash-attention-2).
+
+For more information on how to use Flash Attention, please have a look at [this doc page](https://huggingface.co/docs/transformers/en/perf_infer_gpu_one#flashattention-2).
+
 ## 3. Architectural Innovations
 
 So far we have looked into improving computational and memory efficiency by:
@@ -662,7 +664,15 @@ Using the key-value cache has two advantages:
 
 > One should *always* make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. Transformers has the key-value cache enabled by default when making use of the text pipeline or the [`generate` method](https://huggingface.co/docs/transformers/main_classes/text_generation).
 
-Note that the key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example.
+<Tip warning={true}>
+
+Note that, despite our advice to use key-value caches, your LLM output may be slightly different when you use them. This is a property of the matrix multiplication kernels themselves -- you can read more about it [here](https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535).
+
+</Tip>
+
+#### 3.2.1 Multi-round conversation
+
+The key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example.
 
 ```
 User: How many people live in France?
@@ -672,14 +682,45 @@ Assistant: Germany has ca. 81 million inhabitants
 ```
 
 In this chat, the LLM runs auto-regressive decoding twice:
-- 1. The first time, the key-value cache is empty and the input prompt is `"User: How many people live in France?"` and the model auto-regressively generates the text `"Roughly 75 million people live in France"` while increasing the key-value cache at every decoding step.
-- 2. The second time the input prompt is `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?"`. Thanks to the cache, all key-value vectors for the first two sentences are already computed. Therefore the input prompt only consists of `"User: And how many in Germany?"`. While processing the shortened input prompt, it's computed key-value vectors are concatenated to the key-value cache of the first decoding. The second Assistant's answer `"Germany has ca. 81 million inhabitants"` is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`.
+  1. The first time, the key-value cache is empty and the input prompt is `"User: How many people live in France?"` and the model auto-regressively generates the text `"Roughly 75 million people live in France"` while increasing the key-value cache at every decoding step.
+  2. The second time the input prompt is `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?"`. Thanks to the cache, all key-value vectors for the first two sentences are already computed. Therefore the input prompt only consists of `"User: And how many in Germany?"`. While processing the shortened input prompt, it's computed key-value vectors are concatenated to the key-value cache of the first decoding. The second Assistant's answer `"Germany has ca. 81 million inhabitants"` is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`.
 
 Two things should be noted here:
   1. Keeping all the context is crucial for LLMs deployed in chat so that the LLM understands all the previous context of the conversation. E.g. for the example above the LLM needs to understand that the user refers to the population when asking `"And how many are in Germany"`.
   2. The key-value cache is extremely useful for chat as it allows us to continuously grow the encoded chat history instead of having to re-encode the chat history again from scratch (as e.g. would be the case when using an encoder-decoder architecture).
 
-There is however one catch. While the required peak memory for the \\( \mathbf{QK}^T \\) matrix is significantly reduced, holding the key-value cache in memory can become very memory expensive for long input sequences or multi-turn chat. Remember that the key-value cache needs to store the key-value vectors for all previous input vectors \\( \mathbf{x}_i \text{, for } i \in \{1, \ldots, c - 1\} \\) for all self-attention layers and for all attention heads.
+In `transformers`, a `generate` call will return `past_key_values` when `return_dict_in_generate=True` is passed, in addition to the default `use_cache=True`. Note that it is not yet available through the `pipeline` interface.
+
+```python
+# Generation as usual
+prompt = system_prompt + "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"
+model_inputs = tokenizer(prompt, return_tensors='pt')
+generation_output = model.generate(**model_inputs, max_new_tokens=60, return_dict_in_generate=True)
+decoded_output = tokenizer.batch_decode(generation_output.sequences)[0]
+
+# Piping the returned `past_key_values` to speed up the next conversation round
+prompt = decoded_output + "\nQuestion: How can I modify the function above to return Mega bytes instead?\n\nAnswer: Here"
+model_inputs = tokenizer(prompt, return_tensors='pt')
+generation_output = model.generate(
+  **model_inputs,
+  past_key_values=generation_output.past_key_values,
+  max_new_tokens=60,
+  return_dict_in_generate=True
+)
+tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):]
+```
+
+**Output**:
+```
+ is a modified version of the function that returns Mega bytes instead.
+
+def bytes_to_megabytes(bytes):
+   return bytes / 1024 / 1024
+
+Answer: The function takes a number of bytes as input and returns the number of
+```
+
+Great, no additional time is spent recomputing the same key and values for the attention layer! There is however one catch. While the required peak memory for the \\( \mathbf{QK}^T \\) matrix is significantly reduced, holding the key-value cache in memory can become very memory expensive for long input sequences or multi-turn chat. Remember that the key-value cache needs to store the key-value vectors for all previous input vectors \\( \mathbf{x}_i \text{, for } i \in \{1, \ldots, c - 1\} \\) for all self-attention layers and for all attention heads.
 
 Let's compute the number of float values that need to be stored in the key-value cache for the LLM `bigcode/octocoder` that we used before.
 The number of float values amounts to two times the sequence length times the number of attention heads times the attention head dimension and times the number of layers.
@@ -696,11 +737,11 @@ config = model.config
 ```
 
 Roughly 8 billion float values! Storing 8 billion float values in `float16` precision requires around 15 GB of RAM which is circa half as much as the model weights themselves!
-Researchers have proposed two methods that allow to significantly reduce the memory cost of storing the key-value cache:
+Researchers have proposed two methods that allow to significantly reduce the memory cost of storing the key-value cache, which are explored in the next subsections.
 
-  1.  [Multi-Query-Attention (MQA)](https://arxiv.org/abs/1911.02150)
+#### 3.2.2 Multi-Query-Attention (MQA)
 
-Multi-Query-Attention was proposed in Noam Shazeer's *Fast Transformer Decoding: One Write-Head is All You Need* paper. As the title says, Noam found out that instead of using `n_head` key-value projections weights, one can use a single head-value projection weight pair that is shared across all attention heads without that the model's performance significantly degrades.
+[Multi-Query-Attention](https://arxiv.org/abs/1911.02150) was proposed in Noam Shazeer's *Fast Transformer Decoding: One Write-Head is All You Need* paper. As the title says, Noam found out that instead of using `n_head` key-value projections weights, one can use a single head-value projection weight pair that is shared across all attention heads without that the model's performance significantly degrades.
 
 > By using a single head-value projection weight pair, the key value vectors \\( \mathbf{k}_i, \mathbf{v}_i \\) have to be identical across all attention heads which in turn means that we only need to store 1 key-value projection pair in the cache instead of `n_head` ones.
 
@@ -720,9 +761,9 @@ MQA has seen wide adoption by the community and is now used by many of the most
 
 Also, the checkpoint used in this notebook - `bigcode/octocoder` - makes use of MQA.
 
-  2.  [Grouped-Query-Attention (GQA)](https://arxiv.org/abs/2305.13245)
+#### 3.2.3 Grouped-Query-Attention (GQA)
 
-Grouped-Query-Attention, as proposed by Ainslie et al. from Google, found that using MQA can often lead to quality degradation compared to using vanilla multi-key-value head projections. The paper argues that more model performance can be kept by less drastically reducing the number of query head projection weights. Instead of using just a single key-value projection weight, `n < n_head` key-value projection weights should be used. By choosing `n` to a significantly smaller value than `n_head`, such as 2,4 or 8 almost all of the memory and speed gains from MQA can be kept while sacrificing less model capacity and thus arguably less performance.
+[Grouped-Query-Attention](https://arxiv.org/abs/2305.13245), as proposed by Ainslie et al. from Google, found that using MQA can often lead to quality degradation compared to using vanilla multi-key-value head projections. The paper argues that more model performance can be kept by less drastically reducing the number of query head projection weights. Instead of using just a single key-value projection weight, `n < n_head` key-value projection weights should be used. By choosing `n` to a significantly smaller value than `n_head`, such as 2,4 or 8 almost all of the memory and speed gains from MQA can be kept while sacrificing less model capacity and thus arguably less performance.
 
 Moreover, the authors of GQA found out that existing model checkpoints can be *uptrained* to have a GQA architecture with as little as 5% of the original pre-training compute. While 5% of the original pre-training compute can still be a massive amount, GQA *uptraining* allows existing checkpoints to be useful for longer input sequences.
 
@@ -731,6 +772,7 @@ The most notable application of GQA is [Llama-v2](https://huggingface.co/meta-ll
 
 > As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat.
 
+
 ## Conclusion
 
 The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. As an example, one such promising research direction is [speculative decoding](https://arxiv.org/abs/2211.17192) where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. Going into more detail is out of the scope of this notebook, but can be read upon in this [nice blog post](https://huggingface.co/blog/assisted-generation).
diff --git a/docs/source/en/main_classes/backbones.md b/docs/source/en/main_classes/backbones.md
new file mode 100644
index 00000000000000..5c3aacd125d03b
--- /dev/null
+++ b/docs/source/en/main_classes/backbones.md
@@ -0,0 +1,93 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Backbones
+
+Backbones are models used for feature extraction for computer vision tasks. One can use a model as backbone in two ways:
+
+* initializing `AutoBackbone` class with a pretrained model,
+* initializing a supported backbone configuration and passing it to the model architecture. 
+
+## Using AutoBackbone 
+
+You can use `AutoBackbone` class to initialize a model as a backbone and get the feature maps for any stage. You can define `out_indices` to indicate the index of the layers which you would like to get the feature maps from. You can also use `out_features` if you know the name of the layers. You can use them interchangeably. If you are using both `out_indices` and `out_features`, ensure they are consistent. Not passing any of the feature map arguments will make the backbone yield the feature maps of the last layer.
+To visualize how stages look like, let's take the Swin model. Each stage is responsible from feature extraction, outputting feature maps.
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stages.png">
+</div>
+
+Illustrating feature maps of the first stage looks like below.
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png">
+</div>
+
+Let's see with an example. Note that `out_indices=(0,)` results in yielding the stem of the model. Stem refers to the stage before the first feature extraction stage. In above diagram, it refers to patch partition. We would like to have the feature maps from stem, first, and second stage of the model.
+```py
+>>> from transformers import AutoImageProcessor, AutoBackbone
+>>> import torch
+>>> from PIL import Image
+>>> import requests
+
+>>> processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
+>>> model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(0,1,2))
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> inputs = processor(image, return_tensors="pt")
+>>> outputs = model(**inputs)
+>>> feature_maps = outputs.feature_maps
+```
+`feature_maps` object now has three feature maps, each can be accessed like below. Say we would like to get the feature map of the stem.
+```python
+>>> list(feature_maps[0].shape)
+[1, 96, 56, 56]
+```
+
+We can get the feature maps of first and second stages like below.
+```python
+>>> list(feature_maps[1].shape)
+[1, 96, 56, 56]
+>>> list(feature_maps[2].shape)
+[1, 192, 28, 28]
+```
+
+## Initializing Backbone Configuration
+
+In computer vision, models consist of backbone, neck, and a head. Backbone extracts the features, neck transforms the output of the backbone and head is used for the main task (e.g. object detection). You can initialize neck and head with model backbones by passing a model configuration to `backbone_config`. For example, below you can see how to initialize the [MaskFormer](../model_doc/maskformer) model with instance segmentation head with [ResNet](../model_doc/resnet) backbone.
+
+```py
+from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
+
+backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50")
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+```
+You can also initialize a backbone with random weights to initialize the model neck with it. 
+
+```py
+backbone_config = ResNetConfig()
+config = MaskFormerConfig(backbone_config=backbone_config)
+model = MaskFormerForInstanceSegmentation(config)
+```
+
+`timm` models are also supported in transformers through `TimmBackbone` and `TimmBackboneConfig`.
+
+```python
+from transformers import TimmBackboneConfig, TimmBackbone
+
+backbone_config = TimmBackboneConfig("resnet50")
+model = TimmBackbone(config=backbone_config)
+```
diff --git a/docs/source/en/main_classes/deepspeed.md b/docs/source/en/main_classes/deepspeed.md
index 277610ce9cda53..8133f6c097c949 100644
--- a/docs/source/en/main_classes/deepspeed.md
+++ b/docs/source/en/main_classes/deepspeed.md
@@ -287,7 +287,7 @@ The information in this section isn't not specific to the DeepSpeed integration
 
 For the duration of this section let's assume that you have 2 nodes with 8 gpus each. And you can reach the first node with `ssh hostname1` and second node with `ssh hostname2`, and both must be able to reach each other via ssh locally without a password. Of course, you will need to rename these host (node) names to the actual host names you are working with.
 
-#### The torch.distributed.run launcher
+#### The torch.distributed.run(torchrun) launcher
 
 
 For example, to use `torch.distributed.run`, you could do:
diff --git a/docs/source/en/main_classes/quantization.md b/docs/source/en/main_classes/quantization.md
index 7200039e3f5058..271cf17412fbe6 100644
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@@ -14,535 +14,24 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Quantize 🤗 Transformers models
+# Quantization
 
-## AWQ integration
+Quantization techniques reduces memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Transformers supports the AWQ and GPTQ quantization algorithms and it supports 8-bit and 4-bit quantization with bitsandbytes.
 
-AWQ method has been introduced in the [*AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration* paper](https://arxiv.org/abs/2306.00978). With AWQ you can run models in 4-bit precision, while preserving its original quality (i.e. no performance degradation) with a superior throughput that other quantization methods presented below - reaching similar throughput as pure `float16` inference.
+<Tip>
 
-We now support inference with any AWQ model, meaning anyone can load and use AWQ weights that are pushed on the Hub or saved locally. Note that using AWQ requires to have access to a NVIDIA GPU. CPU inference is not supported yet. 
+Learn how to quantize models in the [Quantization](../quantization) guide.
 
-### Quantizing a model
-
-We advise users to look at different existing tools in the ecosystem to quantize their models with AWQ algorithm, such as:
-
-- [`llm-awq`](https://github.com/mit-han-lab/llm-awq) from MIT Han Lab
-- [`autoawq`](https://github.com/casper-hansen/AutoAWQ) from [`casper-hansen`](https://github.com/casper-hansen)
-- Intel neural compressor from Intel - through [`optimum-intel`](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc)
-
-Many other tools might exist in the ecosystem, please feel free to open a PR to add them to the list.
-Currently the integration with 🤗 Transformers is only available for models that have been quantized using `autoawq` library and `llm-awq`. Most of the models quantized with `auto-awq` can be found under [`TheBloke`](https://huggingface.co/TheBloke) namespace of 🤗 Hub, and to quantize models with `llm-awq` please refer to the [`convert_to_hf.py`](https://github.com/mit-han-lab/llm-awq/blob/main/examples/convert_to_hf.py) script in the examples folder of [`llm-awq`](https://github.com/mit-han-lab/llm-awq/).
-
-### Load a quantized model
-
-You can load a quantized model from the Hub using the `from_pretrained` method. Make sure that the pushed weights are quantized, by checking that the attribute `quantization_config` is present in the model's configuration file (`configuration.json`). You can confirm that the model is quantized in the AWQ format by checking the field `quantization_config.quant_method` which should be set to `"awq"`. Note that loading the model will set other weights in `float16` by default for performance reasons. If you want to change that behavior, you can pass `torch_dtype` argument to `torch.float32` or `torch.bfloat16`. You can find in the sections below some example snippets and notebook.
-
-## Example usage
-
-First, you need to install [`autoawq`](https://github.com/casper-hansen/AutoAWQ) library
-
-```bash
-pip install autoawq
-```
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_id = "TheBloke/zephyr-7B-alpha-AWQ"
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")
-```
-
-In case you first load your model on CPU, make sure to move it to your GPU device before using 
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_id = "TheBloke/zephyr-7B-alpha-AWQ"
-model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda:0")
-```
-
-### Combining AWQ and Flash Attention
-
-You can combine AWQ quantization with Flash Attention to get a model that is both quantized and faster. Simply load the model using `from_pretrained` and pass `use_flash_attention_2=True` argument.
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", use_flash_attention_2=True, device_map="cuda:0")
-```
-
-### Benchmarks
-
-We performed some speed, throughput and latency benchmarks using [`optimum-benchmark`](https://github.com/huggingface/optimum-benchmark) library. 
-
-Note at that time of writing this documentation section, the available quantization methods were: `awq`, `gptq` and `bitsandbytes`.
-
-The benchmark was run on a NVIDIA-A100 instance and the model used was [`TheBloke/Mistral-7B-v0.1-AWQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-AWQ) for the AWQ model, [`TheBloke/Mistral-7B-v0.1-GPTQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-GPTQ) for the GPTQ model. We also benchmarked it against `bitsandbytes` quantization methods and native `float16` model. Some results are shown below:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_memory_plot.png">
-</div>
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_memory_plot.png">
-</div>
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_throughput_plot.png">
-</div>
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_latency_plot.png">
-</div>
-
-You can find the full results together with packages versions in [this link](https://github.com/huggingface/optimum-benchmark/tree/main/examples/running-mistrals).
-
-From the results it appears that AWQ quantization method is the fastest quantization method for inference, text generation and among the lowest peak memory for text generation. However, AWQ seems to have the largest forward latency per batch size. 
-
-### Google colab demo
-
-Check out how to use this integration throughout this [Google Colab demo](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)!
-
-### AwqConfig
-
-[[autodoc]] AwqConfig
-
-## `AutoGPTQ` Integration
-
-🤗 Transformers has integrated `optimum` API to perform GPTQ quantization on language models. You can load and quantize your model in 8, 4, 3 or even 2 bits without a big drop of performance and faster inference speed! This is supported by most GPU hardwares.
-
-To learn more about the quantization model, check out: 
-- the [GPTQ](https://arxiv.org/pdf/2210.17323.pdf) paper
-- the `optimum` [guide](https://huggingface.co/docs/optimum/llm_quantization/usage_guides/quantization) on GPTQ quantization
-- the [`AutoGPTQ`](https://github.com/PanQiWei/AutoGPTQ) library used as the backend
-
-### Requirements
-
-You need to have the following requirements installed to run the code below: 
-
-- Install latest `AutoGPTQ` library
-`pip install auto-gptq`
-
-- Install latest `optimum` from source 
-`pip install git+https://github.com/huggingface/optimum.git`
-
-- Install latest `transformers` from source 
-`pip install git+https://github.com/huggingface/transformers.git`
-
-- Install latest `accelerate` library 
-`pip install --upgrade accelerate`
-
-Note that GPTQ integration supports for now only text models and you may encounter unexpected behaviour for vision, speech or multi-modal models.
-
-### Load and quantize a model
-
-GPTQ is a quantization method that requires weights calibration before using the quantized models. If you want to quantize transformers model from scratch, it might take some time before producing the quantized model (~5 min on a Google colab for `facebook/opt-350m` model). 
-
-Hence, there are two different scenarios where you want to use GPTQ-quantized models. The first use case would be to load models that has been already quantized by other users that are available on the Hub, the second use case would be to quantize your model from scratch and save it or push it on the Hub so that other users can also use it.
-
-#### GPTQ Configuration
-
-In order to load and quantize a model, you need to create a [`GPTQConfig`]. You need to pass the number of `bits`, a `dataset` in order to calibrate the quantization and the `tokenizer` of the model in order prepare the dataset.
-
-```python 
-model_id = "facebook/opt-125m"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)
-```
-
-Note that you can pass your own dataset as a list of string. However, it is highly recommended to use the dataset from the GPTQ paper. 
-
-```python
-dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
-quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer)
-```
-
-#### Quantization
-
-You can quantize a model by using `from_pretrained` and setting the `quantization_config`. 
-
-```python
-from transformers import AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config)
-
-```
-Note that you will need a GPU to quantize a model. We will put the model in the cpu and move the modules back and forth to the gpu in order to quantize them.
-
-If you want to maximize your gpus usage while using cpu offload, you can set `device_map = "auto"`.
-
-```python
-from transformers import AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
-```
-
-Note that disk offload is not supported. Furthermore, if you are out of memory because of the dataset, you may have to pass `max_memory` in `from_pretained`. Checkout this [guide](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#designing-a-device-map) to learn more about `device_map` and `max_memory`.
-
-<Tip warning={true}>
-GPTQ quantization only works for text model for now. Futhermore, the quantization process can a lot of time depending on one's hardware (175B model = 4 gpu hours using NVIDIA A100). Please check on the hub if there is not a GPTQ quantized version of the model. If not, you can submit a demand on github. 
 </Tip>
 
-### Push quantized model to 🤗 Hub
-
-You can push the quantized model like any 🤗 model to Hub with `push_to_hub`. The quantization config will be saved and pushed along the model. 
-
-```python
-quantized_model.push_to_hub("opt-125m-gptq")
-tokenizer.push_to_hub("opt-125m-gptq")
-```
-
-If you want to save your quantized model on your local machine, you can also do it with `save_pretrained`: 
-
-```python
-quantized_model.save_pretrained("opt-125m-gptq")
-tokenizer.save_pretrained("opt-125m-gptq")
-```
-
-Note that if you have quantized your model with a `device_map`, make sure to move the entire model to one of your gpus or the `cpu` before saving it.
-
-```python
-quantized_model.to("cpu")
-quantized_model.save_pretrained("opt-125m-gptq")
-```
-
-### Load a quantized model from the 🤗 Hub
-
-You can load a quantized model from the Hub by using `from_pretrained`.
-Make sure that the pushed weights are quantized, by checking that the attribute `quantization_config` is present in the model configuration object.
-
-```python
-from transformers import AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq")
-```
-
-If you want to load a model faster and without allocating more memory than needed, the `device_map` argument also works with quantized model. Make sure that you have `accelerate` library installed.
-
-```python
-from transformers import AutoModelForCausalLM
-model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
-```
-
-### Exllama kernels for faster inference
-
-For 4-bit model, you can use the exllama kernels in order to a faster inference speed. It is activated by default. You can change that behavior by passing `use_exllama` in [`GPTQConfig`]. This will overwrite the quantization config stored in the config. Note that you will only be able to overwrite the attributes related to the kernels. Furthermore, you need to have the entire model on gpus if you want to use exllama kernels. Also, you can perform CPU inference using Auto-GPTQ for Auto-GPTQ version > 0.4.2 by passing `device_map` = "cpu". For CPU inference, you have to pass `use_exllama = False` in the `GPTQConfig.`
+## AwqConfig
 
-```py
-import torch
-gptq_config = GPTQConfig(bits=4)
-model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
-```
-
-With the release of the exllamav2 kernels, you can get faster inference speed compared to the exllama kernels. You just need to pass `exllama_config={"version": 2}` in [`GPTQConfig`]:
-
-```py
-import torch
-gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
-model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config = gptq_config)
-```
-
-Note that only 4-bit models are supported for now. Furthermore, it is recommended to deactivate the exllama kernels if you are finetuning a quantized model with peft. 
-
-You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)
-#### Fine-tune a quantized model 
-
-With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ. 
-Please have a look at [`peft`](https://github.com/huggingface/peft) library for more details.
-
-### Example demo
-
-Check out the Google Colab [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) to learn how to quantize your model with GPTQ and how finetune the quantized model with peft. 
+[[autodoc]] AwqConfig
 
-### GPTQConfig
+## GPTQConfig
 
 [[autodoc]] GPTQConfig
 
-
-## `bitsandbytes` Integration
-
-🤗 Transformers is closely integrated with most used modules on `bitsandbytes`. You can load your model in 8-bit precision with few lines of code.
-This is supported by most of the GPU hardwares since the `0.37.0` release of `bitsandbytes`.
-
-Learn more about the quantization method in the [LLM.int8()](https://arxiv.org/abs/2208.07339) paper, or the [blogpost](https://huggingface.co/blog/hf-bitsandbytes-integration) about the collaboration.
-
-Since its `0.39.0` release, you can load any model that supports `device_map` using 4-bit quantization, leveraging FP4 data type.
-
-If you want to quantize your own pytorch model, check out this [documentation](https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization) from 🤗 Accelerate library. 
-
-Here are the things you can do using `bitsandbytes` integration
-
-### General usage
-
-You can quantize a model by using the `load_in_8bit` or `load_in_4bit` argument when calling the [`~PreTrainedModel.from_pretrained`] method as long as your model supports loading with 🤗 Accelerate and contains `torch.nn.Linear` layers. This should work for any modality as well.
-
-```python
-from transformers import AutoModelForCausalLM
-
-model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True)
-model_4bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_4bit=True)
-```
-
-By default all other modules (e.g. `torch.nn.LayerNorm`) will be converted in `torch.float16`, but if you want to change their `dtype` you can overwrite the `torch_dtype` argument:
-
-```python
->>> import torch
->>> from transformers import AutoModelForCausalLM
-
->>> model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True, torch_dtype=torch.float32)
->>> model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
-torch.float32
-```
-
-
-### FP4 quantization 
-
-#### Requirements
-
-Make sure that you have installed the requirements below before running any of the code snippets below.
-
-- Latest `bitsandbytes` library
-`pip install bitsandbytes>=0.39.0`
-
-- Install latest `accelerate`
-`pip install --upgrade accelerate`
-
-- Install latest `transformers`
-`pip install --upgrade transformers`
-
-#### Tips and best practices
-
-- **Advanced usage:** Refer to [this Google Colab notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf) for advanced usage of 4-bit quantization with all the possible options.
-
-- **Faster inference with `batch_size=1` :** Since the `0.40.0` release of bitsandbytes, for `batch_size=1` you can benefit from fast inference. Check out [these release notes](https://github.com/TimDettmers/bitsandbytes/releases/tag/0.40.0) and make sure to have a version that is greater than `0.40.0` to benefit from this feature out of the box. 
-
-- **Training:** According to [QLoRA paper](https://arxiv.org/abs/2305.14314), for training 4-bit base models (e.g. using LoRA adapters) one should use `bnb_4bit_quant_type='nf4'`. 
-
-- **Inference:** For inference, `bnb_4bit_quant_type` does not have a huge impact on the performance. However for consistency with the model's weights, make sure you use the same `bnb_4bit_compute_dtype` and `torch_dtype` arguments.
-
-#### Load a large model in 4bit
-
-By using `load_in_4bit=True` when calling the `.from_pretrained` method, you can divide your memory use by 4 (roughly).
-
-```python
-# pip install transformers accelerate bitsandbytes
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_id = "bigscience/bloom-1b7"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
-```
-
-<Tip warning={true}>
-
-Note that once a model has been loaded in 4-bit it is currently not possible to push the quantized weights on the Hub. Note also that you cannot train 4-bit weights as this is not supported yet. However you can use 4-bit models to train extra parameters, this will be covered in the next section.
-
-</Tip>
-
-### Load a large model in 8bit
-
-You can load a model by roughly halving the memory requirements by using `load_in_8bit=True` argument when calling `.from_pretrained` method
-
-
-```python
-# pip install transformers accelerate bitsandbytes
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model_id = "bigscience/bloom-1b7"
-
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
-```
-
-Then, use your model as you would usually use a [`PreTrainedModel`].
-
-You can check the memory footprint of your model with `get_memory_footprint` method.
-
-```python
-print(model.get_memory_footprint())
-```
-
-With this integration we were able to load large models on smaller devices and run them without any issue.  
-
-<Tip warning={true}>
-
-Note that once a model has been loaded in 8-bit it is currently not possible to push the quantized weights on the Hub except if you use the latest `transformers` and `bitsandbytes`. Note also that you cannot train 8-bit weights as this is not supported yet. However you can use 8-bit models to train extra parameters, this will be covered in the next section.
-Note also that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources.
-
-</Tip>
-
-#### Advanced use cases
-
-Here we will cover some advanced use cases you can perform with FP4 quantization 
-
-##### Change the compute dtype
-
-The compute dtype is used to change the dtype that will be used during computation. For example, hidden states could be in `float32` but computation can be set to bf16 for speedups. By default, the compute dtype is set to `float32`.
-
-```python
-import torch
-from transformers import BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
-```
-
-##### Using NF4 (Normal Float 4) data type 
-
-You can also use the NF4 data type, which is a new 4bit datatype adapted for weights that have been initialized using a normal distribution. For that run:
-
-```python
-from transformers import BitsAndBytesConfig
-
-nf4_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-)
-
-model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
-```
-
-##### Use nested quantization for more memory efficient inference
-
-We also advise users to use the nested quantization technique. This saves more memory at no additional performance - from our empirical observations, this enables fine-tuning llama-13b model on an NVIDIA-T4 16GB with a sequence length of 1024, batch size of 1 and gradient accumulation steps of 4.
-
-```python
-from transformers import BitsAndBytesConfig
-
-double_quant_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-model_double_quant = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=double_quant_config)
-```
-
-
-### Push quantized models on the 🤗 Hub
-
-You can push a quantized model on the Hub by naively using `push_to_hub` method. This will first push the quantization configuration file, then push the quantized model weights.
-Make sure to use `bitsandbytes>0.37.2` (at this time of writing, we tested it on `bitsandbytes==0.38.0.post1`) to be able to use this feature. 
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
-tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
-
-model.push_to_hub("bloom-560m-8bit")
-```
-
-<Tip warning={true}>
-
-Pushing 8bit models on the Hub is strongely encouraged for large models. This will allow the community to benefit from the memory footprint reduction and loading for example large models on a Google Colab.
-
-</Tip>
-
-### Load a quantized model from the 🤗 Hub
-
-You can load a quantized model from the Hub by using `from_pretrained` method. Make sure that the pushed weights are quantized, by checking that the attribute `quantization_config` is present in the model configuration object.
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto")
-```
-
-Note that in this case, you don't need to specify the arguments `load_in_8bit=True`, but you need to make sure that `bitsandbytes` and `accelerate` are installed.
-Note also that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources.
-
-### Advanced use cases
-
-This section is intended to advanced users, that want to explore what it is possible to do beyond loading and running 8-bit models.
-
-#### Offload between `cpu` and `gpu`
-
-One of the advanced use case of this is being able to load a model and dispatch the weights between `CPU` and `GPU`. Note that the weights that will be dispatched on CPU **will not** be converted in 8-bit, thus kept in `float32`. This feature is intended for users that want to fit a very large model and dispatch the model between GPU and CPU.
-
-First, load a [`BitsAndBytesConfig`] from `transformers` and set the attribute `llm_int8_enable_fp32_cpu_offload` to `True`:
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
-```
-
-Let's say you want to load `bigscience/bloom-1b7` model, and you have just enough GPU RAM to fit the entire model except the `lm_head`. Therefore write a custom device_map as follows:
-
-```python
-device_map = {
-    "transformer.word_embeddings": 0,
-    "transformer.word_embeddings_layernorm": 0,
-    "lm_head": "cpu",
-    "transformer.h": 0,
-    "transformer.ln_f": 0,
-}
-```
-
-And load your model as follows:
-```python
-model_8bit = AutoModelForCausalLM.from_pretrained(
-    "bigscience/bloom-1b7",
-    device_map=device_map,
-    quantization_config=quantization_config,
-)
-```
-
-And that's it! Enjoy your model!
-
-#### Play with `llm_int8_threshold`
-
-You can play with the `llm_int8_threshold` argument to change the threshold of the outliers. An "outlier" is a hidden state value that is greater than a certain threshold. 
-This corresponds to the outlier threshold for outlier detection as described in `LLM.int8()` paper. Any hidden states value that is above this threshold will be considered an outlier and the operation on those values will be done in fp16. Values are usually normally distributed, that is, most values are in the range [-3.5, 3.5], but there are some exceptional systematic outliers that are very differently distributed for large models. These outliers are often in the interval [-60, -6] or [6, 60]. Int8 quantization works well for values of magnitude ~5, but beyond that, there is a significant performance penalty. A good default threshold is 6, but a lower threshold might be needed for more unstable models (small models, fine-tuning).
-This argument can impact the inference speed of the model. We suggest to play with this parameter to find which one is the best for your use case.
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-model_id = "bigscience/bloom-1b7"
-
-quantization_config = BitsAndBytesConfig(
-    llm_int8_threshold=10,
-)
-
-model_8bit = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map=device_map,
-    quantization_config=quantization_config,
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-```
-
-#### Skip the conversion of some modules
-
-Some models has several modules that needs to be not converted in 8-bit to ensure stability. For example Jukebox model has several `lm_head` modules that should be skipped. Play with `llm_int8_skip_modules` 
-
-```python
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-model_id = "bigscience/bloom-1b7"
-
-quantization_config = BitsAndBytesConfig(
-    llm_int8_skip_modules=["lm_head"],
-)
-
-model_8bit = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    device_map=device_map,
-    quantization_config=quantization_config,
-)
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-```
-
-#### Fine-tune a model that has been loaded in 8-bit
-
-With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been loaded in 8-bit. 
-This enables fine-tuning large models such as `flan-t5-large` or `facebook/opt-6.7b` in a single google Colab. Please have a look at [`peft`](https://github.com/huggingface/peft) library for more details.
-
-Note that you don't need to pass `device_map` when loading the model for training. It will automatically load your model on your GPU. You can also set the device map to a specific device if needed (e.g. `cuda:0`, `0`, `torch.device('cuda:0')`). Please note that `device_map=auto` should be used for inference only. 
-
-### BitsAndBytesConfig
+## BitsAndBytesConfig
 
 [[autodoc]] BitsAndBytesConfig
-
-
-## Quantization with 🤗 `optimum` 
-
-Please have a look at [Optimum documentation](https://huggingface.co/docs/optimum/index) to learn more about quantization methods that are supported by `optimum` and see if these are applicable for your use case.
diff --git a/docs/source/en/main_classes/trainer.md b/docs/source/en/main_classes/trainer.md
index e9a93bbff751b1..cf1dd672d3d472 100644
--- a/docs/source/en/main_classes/trainer.md
+++ b/docs/source/en/main_classes/trainer.md
@@ -26,7 +26,7 @@ If you're looking to fine-tune a language model like Llama-2 or Mistral on a tex
 
 Before instantiating your [`Trainer`], create a [`TrainingArguments`] to access all the points of customization during training.
 
-The API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex](https://github.com/NVIDIA/apex) and Native AMP for PyTorch.
+The API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex] for NVIDIA GPUs, [ROCm APEX](https://github.com/ROCmSoftwarePlatform/apex) for AMD GPUs, and Native AMP for PyTorch.
 
 The [`Trainer`] contains the basic training loop which supports the above features. To inject custom behavior you can subclass them and override the following methods:
 
@@ -206,7 +206,7 @@ Let's discuss how you can tell your program which GPUs are to be used and in wha
 When using [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) to use only a subset of your GPUs, you simply specify the number of GPUs to use. For example, if you have 4 GPUs, but you wish to use the first 2 you can do:
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node=2  trainer-program.py ...
+torchrun --nproc_per_node=2  trainer-program.py ...
 ```
 
 if you have either [`accelerate`](https://github.com/huggingface/accelerate) or [`deepspeed`](https://github.com/microsoft/DeepSpeed) installed you can also accomplish the same by using one of:
@@ -219,7 +219,7 @@ accelerate launch --num_processes 2 trainer-program.py ...
 deepspeed --num_gpus 2 trainer-program.py ...
 ```
 
-You don't need to use the Accelerate or [the Deepspeed integration](Deepspeed) features to use these launchers.
+You don't need to use the Accelerate or [the Deepspeed integration](deepspeed) features to use these launchers.
 
 
 Until now you were able to tell the program how many GPUs to use. Now let's discuss how to select specific GPUs and control their order.
@@ -233,7 +233,7 @@ If you have multiple GPUs and you'd like to use only 1 or a few of those GPUs, s
 For example, let's say you have 4 GPUs: 0, 1, 2 and 3. To run only on the physical GPUs 0 and 2, you can do:
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,2 python -m torch.distributed.launch trainer-program.py ...
+CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
 ```
 
 So now pytorch will see only 2 GPUs, where your physical GPUs 0 and 2 are mapped to `cuda:0` and `cuda:1` correspondingly.
@@ -241,7 +241,7 @@ So now pytorch will see only 2 GPUs, where your physical GPUs 0 and 2 are mapped
 You can even change their order:
 
 ```bash
-CUDA_VISIBLE_DEVICES=2,0 python -m torch.distributed.launch trainer-program.py ...
+CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
 ```
 
 Here your physical GPUs 0 and 2 are mapped to `cuda:1` and `cuda:0` correspondingly.
@@ -263,7 +263,7 @@ As with any environment variable you can, of course, export those instead of add
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0,2
-python -m torch.distributed.launch trainer-program.py ...
+torchrun trainer-program.py ...
 ```
 
 but this approach can be confusing since you may forget you set up the environment variable earlier and not understand why the wrong GPUs are used. Therefore, it's a common practice to set the environment variable just for a specific run on the same command line as it's shown in most examples of this section.
@@ -272,7 +272,7 @@ but this approach can be confusing since you may forget you set up the environme
 
 There is an additional environment variable `CUDA_DEVICE_ORDER` that controls how the physical devices are ordered. The two choices are:
 
-1. ordered by PCIe bus IDs (matches `nvidia-smi`'s order) - this is the default.
+1. ordered by PCIe bus IDs (matches `nvidia-smi` and `rocm-smi`'s order) - this is the default.
 
 ```bash
 export CUDA_DEVICE_ORDER=PCI_BUS_ID
@@ -284,7 +284,7 @@ export CUDA_DEVICE_ORDER=PCI_BUS_ID
 export CUDA_DEVICE_ORDER=FASTEST_FIRST
 ```
 
-Most of the time you don't need to care about this environment variable, but it's very helpful if you have a lopsided setup where you have an old and a new GPUs physically inserted in such a way so that the slow older card appears to be first. One way to fix that is to swap the cards. But if you can't swap the cards (e.g., if the cooling of the devices gets impacted) then setting `CUDA_DEVICE_ORDER=FASTEST_FIRST` will always put the newer faster card first. It'll be somewhat confusing though since `nvidia-smi` will still report them in the PCIe order.
+Most of the time you don't need to care about this environment variable, but it's very helpful if you have a lopsided setup where you have an old and a new GPUs physically inserted in such a way so that the slow older card appears to be first. One way to fix that is to swap the cards. But if you can't swap the cards (e.g., if the cooling of the devices gets impacted) then setting `CUDA_DEVICE_ORDER=FASTEST_FIRST` will always put the newer faster card first. It'll be somewhat confusing though since `nvidia-smi` (or `rocm-smi`) will still report them in the PCIe order.
 
 The other solution to swapping the order is to use:
 
@@ -426,8 +426,7 @@ To read more about it and the benefits, check out the [Fully Sharded Data Parall
 We have integrated the latest PyTorch's Fully Sharded Data Parallel (FSDP) training feature.
 All you need to do is enable it through the config.
 
-**Required PyTorch version for FSDP support**: PyTorch Nightly (or 1.12.0 if you read this after it has been released)
-as the model saving with FSDP activated is only available with recent fixes.
+**Required PyTorch version for FSDP support**: PyTorch >=2.1.0
 
 **Usage**:
 
@@ -440,6 +439,8 @@ as the model saving with FSDP activated is only available with recent fixes.
   - SHARD_GRAD_OP : Shards optimizer states + gradients across data parallel workers/GPUs.
     For this, add `--fsdp shard_grad_op` to the command line arguments.
   - NO_SHARD : No sharding. For this, add `--fsdp no_shard` to the command line arguments.
+  - HYBRID_SHARD : No sharding. For this, add `--fsdp hybrid_shard` to the command line arguments.
+  - HYBRID_SHARD_ZERO2 : No sharding. For this, add `--fsdp hybrid_shard_zero2` to the command line arguments.
 - To offload the parameters and gradients to the CPU, 
   add `--fsdp "full_shard offload"` or `--fsdp "shard_grad_op offload"` to the command line arguments.
 - To automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`, 
@@ -449,18 +450,18 @@ as the model saving with FSDP activated is only available with recent fixes.
 - Remaining FSDP config is passed via `--fsdp_config <path_to_fsdp_config.json>`. It is either a location of
   FSDP json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`. 
   - If auto wrapping is enabled, you can either use transformer based auto wrap policy or size based auto wrap policy.
-    - For transformer based auto wrap policy, it is recommended to specify `fsdp_transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available.
+    - For transformer based auto wrap policy, it is recommended to specify `transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available.
       This specifies the list of transformer layer class name (case-sensitive) to wrap ,e.g, [`BertLayer`], [`GPTJBlock`], [`T5Block`] ....
       This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units.
       Using this policy, wrapping happens for each block containing Multi-Head Attention followed by couple of MLP layers. 
       Remaining layers including the shared embeddings are conveniently wrapped in same outermost FSDP unit.
       Therefore, use this for transformer based models.
-    - For size based auto wrap policy, please add `fsdp_min_num_params` in the config file. 
+    - For size based auto wrap policy, please add `min_num_params` in the config file. 
       It specifies FSDP's minimum number of parameters for auto wrapping.
-  - `fsdp_backward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. 
+  - `backward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. 
     `backward_pre` and `backward_pos` are available options. 
     For more information refer `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`
-  - `fsdp_forward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. 
+  - `forward_prefetch` can be specified in the config file. It controls when to prefetch next set of parameters. 
     If `"True"`, FSDP explicitly prefetches the next upcoming all-gather while executing in the forward pass. 
   - `limit_all_gathers` can be specified in the config file. 
     If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight all-gathers.
@@ -468,6 +469,20 @@ as the model saving with FSDP activated is only available with recent fixes.
     If `"True"`, FSDP activation checkpointing is a technique to reduce memory usage by clearing activations of
     certain layers and recomputing them during a backward pass. Effectively, this trades extra computation time
     for reduced memory usage.
+  - `use_orig_params` can be specified in the config file. 
+    If True, allows non-uniform `requires_grad` during init, which means support for interspersed frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. This also enables to have different optimizer param groups. This should be `True` when creating optimizer object before preparing/wrapping the model with FSDP.
+    Please refer this [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019). 
+
+**Saving and loading**
+Saving entire intermediate checkpoints using `FULL_STATE_DICT` state_dict_type with CPU offloading on rank 0 takes a lot of time and often results in NCCL Timeout errors due to indefinite hanging during broadcasting. However, at the end of training, we want the whole model state dict instead of the sharded state dict which is only compatible with FSDP. Use `SHARDED_STATE_DICT` (default) state_dict_type to save the intermediate checkpoints and optimizer states in this format recommended by the PyTorch team. 
+
+Saving the final checkpoint in transformers format using default `safetensors` format requires below changes.
+```python
+if trainer.is_fsdp_enabled:
+    trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
+
+trainer.save_model(script_args.output_dir)
+```
 
 **Few caveats to be aware of**
 - it is incompatible with `generate`, thus is incompatible with `--predict_with_generate` 
@@ -492,15 +507,15 @@ Pass `--fsdp "full shard"` along with following changes to be made in `--fsdp_co
   https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py).
 - `xla_fsdp_grad_ckpt`. When `True`, uses gradient checkpointing over each nested XLA FSDP wrapped layer. 
   This setting can only be used when the xla flag is set to true, and an auto wrapping policy is specified through
-  `fsdp_min_num_params` or `fsdp_transformer_layer_cls_to_wrap`. 
+  `min_num_params` or `transformer_layer_cls_to_wrap`. 
 - You can either use transformer based auto wrap policy or size based auto wrap policy.
-  - For transformer based auto wrap policy, it is recommended to specify `fsdp_transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available.
+  - For transformer based auto wrap policy, it is recommended to specify `transformer_layer_cls_to_wrap` in the config file. If not specified, the default value is `model._no_split_modules` when available.
     This specifies the list of transformer layer class name (case-sensitive) to wrap ,e.g, [`BertLayer`], [`GPTJBlock`], [`T5Block`] ....
     This is important because submodules that share weights (e.g., embedding layer) should not end up in different FSDP wrapped units.
     Using this policy, wrapping happens for each block containing Multi-Head Attention followed by couple of MLP layers. 
     Remaining layers including the shared embeddings are conveniently wrapped in same outermost FSDP unit.
     Therefore, use this for transformer based models.
-  - For size based auto wrap policy, please add `fsdp_min_num_params` in the config file. 
+  - For size based auto wrap policy, please add `min_num_params` in the config file. 
     It specifies FSDP's minimum number of parameters for auto wrapping.
 
 
diff --git a/docs/source/en/model_doc/albert.md b/docs/source/en/model_doc/albert.md
index b7a819b2ed4628..a75e6757804862 100644
--- a/docs/source/en/model_doc/albert.md
+++ b/docs/source/en/model_doc/albert.md
@@ -59,13 +59,67 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This
 - Layers are split in groups that share parameters (to save memory).
 Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not.
 
+
+
+This model was contributed by [lysandre](https://huggingface.co/lysandre). This model jax version was contributed by
+[kamalkraj](https://huggingface.co/kamalkraj). The original code can be found [here](https://github.com/google-research/ALBERT).
+
+
 ## Resources
 
-- [Text classification task guide](../tasks/sequence_classification)
-- [Token classification task guide](../tasks/token_classification)
-- [Question answering task guide](../tasks/question_answering)
-- [Masked language modeling task guide](../tasks/masked_language_modeling)
-- [Multiple choice task guide](../tasks/multiple_choice)
+
+The resources provided in the following sections consist of a list of official Hugging Face and community (indicated by 🌎) resources to help you get started with AlBERT. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+
+<PipelineTag pipeline="text-classification"/>
+
+
+- [`AlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification).
+
+
+- [`TFAlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification).
+
+- [`FlaxAlbertForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb).
+- Check the [Text classification task guide](../tasks/sequence_classification) on how to use the model.
+
+
+<PipelineTag pipeline="token-classification"/>
+
+
+- [`AlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification).
+
+
+- [`TFAlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb).
+
+
+
+- [`FlaxAlbertForTokenClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification).
+- [Token classification](https://huggingface.co/course/chapter7/2?fw=pt) chapter of the 🤗 Hugging Face Course.
+- Check the [Token classification task guide](../tasks/token_classification) on how to use the model.
+
+<PipelineTag pipeline="fill-mask"/>
+
+- [`AlbertForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
+- [`TFAlbertForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
+- [`FlaxAlbertForMaskedLM`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb).
+- [Masked language modeling](https://huggingface.co/course/chapter7/3?fw=pt) chapter of the 🤗 Hugging Face Course.
+- Check the [Masked language modeling task guide](../tasks/masked_language_modeling) on how to use the model.
+
+<PipelineTag pipeline="question-answering"/>
+
+- [`AlbertForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
+- [`TFAlbertForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb).
+- [`FlaxAlbertForQuestionAnswering`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering).
+- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter of the 🤗 Hugging Face Course.
+- Check the [Question answering task guide](../tasks/question_answering) on how to use the model.
+
+**Multiple choice**
+
+- [`AlbertForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb).
+- [`TFAlbertForMultipleChoice`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb).
+
+- Check the  [Multiple choice task guide](../tasks/multiple_choice) on how to use the model.
+
 
 ## AlbertConfig
 
diff --git a/docs/source/en/model_doc/auto.md b/docs/source/en/model_doc/auto.md
index 75d162bbcf13cf..9dbaaf3acbbbb6 100644
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@@ -49,7 +49,7 @@ You will then be able to use the auto classes like you would usually do!
 
 <Tip warning={true}>
 
-If your `NewModelConfig` is a subclass of [`~transformer.PretrainedConfig`], make sure its
+If your `NewModelConfig` is a subclass of [`~transformers.PretrainedConfig`], make sure its
 `model_type` attribute is set to the same key you use when registering the config (here `"new-model"`).
 
 Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
diff --git a/docs/source/en/model_doc/bark.md b/docs/source/en/model_doc/bark.md
index 2160159bd783a3..7c02e4be701187 100644
--- a/docs/source/en/model_doc/bark.md
+++ b/docs/source/en/model_doc/bark.md
@@ -83,10 +83,10 @@ pip install -U flash-attn --no-build-isolation
 
 ##### Usage
 
-To load a model using Flash Attention 2, we can pass the `use_flash_attention_2` flag to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference:
+To load a model using Flash Attention 2, we can pass the `attn_implementation="flash_attention_2"` flag to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference:
 
 ```python
-model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, use_flash_attention_2=True).to(device)
+model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
 ```
 
 ##### Performance comparison
@@ -114,7 +114,7 @@ import torch
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # load in fp16 and use Flash Attention 2
-model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, use_flash_attention_2=True).to(device)
+model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
 
 # enable CPU offload
 model.enable_cpu_offload()
diff --git a/docs/source/en/model_doc/biogpt.md b/docs/source/en/model_doc/biogpt.md
index 1cac6d10990dca..20a8e4d9cd307c 100644
--- a/docs/source/en/model_doc/biogpt.md
+++ b/docs/source/en/model_doc/biogpt.md
@@ -18,8 +18,7 @@ rendered properly in your Markdown viewer.
 
 ## Overview
 
-The BioGPT model was proposed in [BioGPT: generative pre-trained transformer for biomedical text generation and mining
-](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. BioGPT is a domain-specific generative pre-trained Transformer language model for biomedical text generation and mining. BioGPT follows the Transformer language model backbone, and is pre-trained on 15M PubMed abstracts from scratch.
+The BioGPT model was proposed in [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu. BioGPT is a domain-specific generative pre-trained Transformer language model for biomedical text generation and mining. BioGPT follows the Transformer language model backbone, and is pre-trained on 15M PubMed abstracts from scratch.
 
 The abstract from the paper is the following:
 
diff --git a/docs/source/en/model_doc/detr.md b/docs/source/en/model_doc/detr.md
index c36bd4380edf89..60937b6012119f 100644
--- a/docs/source/en/model_doc/detr.md
+++ b/docs/source/en/model_doc/detr.md
@@ -146,7 +146,7 @@ As a summary, consider the following table:
 | **Model** | [`~transformers.DetrForObjectDetection`] | [`~transformers.DetrForSegmentation`] | [`~transformers.DetrForSegmentation`] |
 | **Example dataset** | COCO detection | COCO detection, COCO panoptic | COCO panoptic  |                                                                        |
 | **Format of annotations to provide to**  [`~transformers.DetrImageProcessor`] | {'image_id': `int`, 'annotations': `List[Dict]`} each Dict being a COCO object annotation  | {'image_id': `int`, 'annotations': `List[Dict]`}  (in case of COCO detection) or {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} (in case of COCO panoptic) | {'file_name': `str`, 'image_id': `int`, 'segments_info': `List[Dict]`} and masks_path (path to directory containing PNG files of the masks) |
-| **Postprocessing** (i.e. converting the output of the model to COCO API) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
+| **Postprocessing** (i.e. converting the output of the model to Pascal VOC format) | [`~transformers.DetrImageProcessor.post_process`] | [`~transformers.DetrImageProcessor.post_process_segmentation`] | [`~transformers.DetrImageProcessor.post_process_segmentation`], [`~transformers.DetrImageProcessor.post_process_panoptic`] |
 | **evaluators** | `CocoEvaluator` with `iou_types="bbox"` | `CocoEvaluator` with `iou_types="bbox"` or `"segm"` | `CocoEvaluator` with `iou_tupes="bbox"` or `"segm"`, `PanopticEvaluator` |
 
 In short, one should prepare the data either in COCO detection or COCO panoptic format, then use
diff --git a/docs/source/en/model_doc/dinov2.md b/docs/source/en/model_doc/dinov2.md
index 49a5bd3e260f7b..dca94786773d1d 100644
--- a/docs/source/en/model_doc/dinov2.md
+++ b/docs/source/en/model_doc/dinov2.md
@@ -25,6 +25,49 @@ The abstract from the paper is the following:
 This model was contributed by [nielsr](https://huggingface.co/nielsr).
 The original code can be found [here](https://github.com/facebookresearch/dinov2).
 
+## Usage tips
+
+The model can be traced using `torch.jit.trace` which leverages JIT compilation to optimize the model making it faster to run. Note this still produces some mis-matched elements and the difference between the original model and the traced model is of the order of 1e-4.
+
+```python
+import torch
+from transformers import AutoImageProcessor, AutoModel
+from PIL import Image
+import requests
+
+url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
+image = Image.open(requests.get(url, stream=True).raw)
+
+processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
+model = AutoModel.from_pretrained('facebook/dinov2-base')
+
+inputs = processor(images=image, return_tensors="pt")
+outputs = model(**inputs)
+last_hidden_states = outputs[0]
+
+# We have to force return_dict=False for tracing
+model.config.return_dict = False
+
+with torch.no_grad():
+    traced_model = torch.jit.trace(model, [inputs.pixel_values])
+    traced_outputs = traced_model(inputs.pixel_values)
+
+print((last_hidden_states - traced_outputs[0]).abs().max())
+```
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DPT.
+
+- Demo notebooks for DINOv2 can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/DINOv2). 🌎
+
+<PipelineTag pipeline="image-classification"/>
+
+- [`Dinov2ForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
+- See also: [Image classification task guide](../tasks/image_classification)
+
+If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
 ## Dinov2Config
 
 [[autodoc]] Dinov2Config
diff --git a/docs/source/en/model_doc/distilbert.md b/docs/source/en/model_doc/distilbert.md
index 5742380c517bda..bd39260d3ca492 100644
--- a/docs/source/en/model_doc/distilbert.md
+++ b/docs/source/en/model_doc/distilbert.md
@@ -32,7 +32,7 @@ rendered properly in your Markdown viewer.
 
 The DistilBERT model was proposed in the blog post [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a
 distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5), and the paper [DistilBERT, a
-distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/papers/1910.01108). DistilBERT is a
+distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108). DistilBERT is a
 small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less parameters than
 *bert-base-uncased*, runs 60% faster while preserving over 95% of BERT's performances as measured on the GLUE language
 understanding benchmark.
@@ -153,7 +153,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> device = "cuda" # the device to load the model onto
 
 >>> tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
->>> model = AutoModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, use_flash_attention_2=True)
+>>> model = AutoModel.from_pretrained("distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
 
 >>> text = "Replace me by any text you'd like."
 
diff --git a/docs/source/en/model_doc/dpt.md b/docs/source/en/model_doc/dpt.md
index 5e3e25343cdda4..a02313a31235dd 100644
--- a/docs/source/en/model_doc/dpt.md
+++ b/docs/source/en/model_doc/dpt.md
@@ -32,6 +32,21 @@ alt="drawing" width="600"/>
 
 This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/isl-org/DPT).
 
+## Usage tips
+
+DPT is compatible with the [`AutoBackbone`] class. This allows to use the DPT framework with various computer vision backbones available in the library, such as [`VitDetBackbone`] or [`Dinov2Backbone`]. One can create it as follows:
+
+```python
+from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation
+
+# initialize with a Transformer-based backbone such as DINOv2
+# in that case, we also specify `reshape_hidden_states=False` to get feature maps of shape (batch_size, num_channels, height, width)
+backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False)
+
+config = DPTConfig(backbone_config=backbone_config)
+model = DPTForDepthEstimation(config=config)
+```
+
 ## Resources
 
 A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DPT.
diff --git a/docs/source/en/model_doc/gpt_bigcode.md b/docs/source/en/model_doc/gpt_bigcode.md
index 0f3bc72d03a55f..b3cb078e2a140c 100644
--- a/docs/source/en/model_doc/gpt_bigcode.md
+++ b/docs/source/en/model_doc/gpt_bigcode.md
@@ -59,7 +59,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> device = "cuda" # the device to load the model onto
 
->>> model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, use_flash_attention_2=True)
+>>> model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
 >>> tokenizer = AutoTokenizer.from_pretrained("bigcode/gpt_bigcode-santacoder")
 
 >>> prompt = "def hello_world():"
diff --git a/docs/source/en/model_doc/gpt_neo.md b/docs/source/en/model_doc/gpt_neo.md
index fb2385bc73db2e..3c7858c998207e 100644
--- a/docs/source/en/model_doc/gpt_neo.md
+++ b/docs/source/en/model_doc/gpt_neo.md
@@ -56,13 +56,9 @@ The `generate()` method can be used to generate text using GPT Neo model.
 
 ## Combining GPT-Neo and Flash Attention 2
 
-First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature, and make sure your hardware is compatible with Flash-Attention 2. More details are available [here](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2) concerning the installation.
 
-```bash
-pip install -U flash-attn --no-build-isolation
-```
-
-Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+Make sure as well to load your model in half-precision (e.g. `torch.float16`).
 
 To load and run a model using Flash Attention 2, refer to the snippet below:
 
@@ -71,7 +67,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> device = "cuda" # the device to load the model onto
 
->>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B", torch_dtype=torch.float16, use_flash_attention_2=True)
+>>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
 >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B")
 
 >>> prompt = "def hello_world():"
diff --git a/docs/source/en/model_doc/gpt_neox.md b/docs/source/en/model_doc/gpt_neox.md
index 300001ad5bb1fa..fd105a3e82e1ee 100644
--- a/docs/source/en/model_doc/gpt_neox.md
+++ b/docs/source/en/model_doc/gpt_neox.md
@@ -61,6 +61,40 @@ The `generate()` method can be used to generate text using GPT Neo model.
 >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
 ```
 
+## Using Flash Attention 2
+
+Flash Attention 2 is an faster, optimized version of the model.
+
+### Installation 
+
+First, check whether your hardware is compatible with Flash Attention 2. The latest list of compatible hardware can be found in the [official documentation](https://github.com/Dao-AILab/flash-attention#installation-and-features). If your hardware is not compatible with Flash Attention 2, you can still benefit from attention kernel optimisations through Better Transformer support covered [above](https://huggingface.co/docs/transformers/main/en/model_doc/bark#using-better-transformer).
+
+Next, [install](https://github.com/Dao-AILab/flash-attention#installation-and-features) the latest version of Flash Attention 2:
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+### Usage
+
+To load a model using Flash Attention 2, we can pass the argument `attn_implementation="flash_attention_2"` to [`.from_pretrained`](https://huggingface.co/docs/transformers/main/en/main_classes/model#transformers.PreTrainedModel.from_pretrained). We'll also load the model in half-precision (e.g. `torch.float16`), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference:
+
+```python
+>>> from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast
+
+model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device)
+...
+```
+
+
+### Expected speedups
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `stockmark/gpt-neox-japanese-1.4b` checkpoint and the Flash Attention 2 version of the model using a sequence length of 2048.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/gpt-neox-1.8b-speedup.jpg">
+</div>
+
 ## Resources
 
 - [Causal language modeling task guide](../tasks/language_modeling)
diff --git a/docs/source/en/model_doc/llama.md b/docs/source/en/model_doc/llama.md
index 9f55c425d448f7..96f2a2e7eb7cdd 100644
--- a/docs/source/en/model_doc/llama.md
+++ b/docs/source/en/model_doc/llama.md
@@ -50,6 +50,9 @@ come in several checkpoints they each contain a part of each weight of the model
 
 - The LLaMA tokenizer is a BPE model based on [sentencepiece](https://github.com/google/sentencepiece). One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. "Banana"), the tokenizer does not prepend the prefix space to the string.
 
+This model was contributed by [zphang](https://huggingface.co/zphang) with contributions from [BlackSamorez](https://huggingface.co/BlackSamorez). The code of the implementation in Hugging Face is based on GPT-NeoX [here](https://github.com/EleutherAI/gpt-neox). The original code of the authors can be found [here](https://github.com/facebookresearch/llama). The Flax version of the implementation was contributed by [afmck](https://huggingface.co/afmck) with the code in the implementation based on Hugging Face's Flax GPT-Neo.
+
+
 Based on the original LLaMA model, Meta AI has released some follow-up works:
 
 - **Llama2**: Llama2 is an improved version of Llama with some architectural tweaks (Grouped Query Attention), and is pre-trained on 2Trillion tokens. Refer to the documentation of Llama2 which can be found [here](llama2).
@@ -112,3 +115,13 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 
 [[autodoc]] LlamaForSequenceClassification
     - forward
+
+## FlaxLlamaModel
+
+[[autodoc]] FlaxLlamaModel
+    - __call__
+
+## FlaxLlamaForCausalLM
+
+[[autodoc]] FlaxLlamaForCausalLM
+    - __call__
diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md
new file mode 100644
index 00000000000000..ee7d9bbd1af9be
--- /dev/null
+++ b/docs/source/en/model_doc/llava.md
@@ -0,0 +1,80 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# LLaVa
+
+## Overview
+
+LLaVa is an open-source chatbot trained by fine-tuning LlamA/Vicuna on GPT-generated multimodal instruction-following data. It is an auto-regressive language model, based on the transformer architecture. In other words, it is an multi-modal version of LLMs fine-tuned for chat / instructions.
+
+The LLaVa model was proposed in [Visual Instruction Tuning](https://arxiv.org/abs/2304.08485) and improved in [Improved Baselines with Visual Instruction Tuning](https://arxiv.org/pdf/2310.03744) by Haotian Liu, Chunyuan Li, Yuheng Li and Yong Jae Lee.
+
+The abstract from the paper is the following:
+
+*Large multimodal models (LMM) have recently shown encouraging progress with visual instruction tuning. In this note, we show that the fully-connected vision-language cross-modal connector in LLaVA is surprisingly powerful and data-efficient. With simple modifications to LLaVA, namely, using CLIP-ViT-L-336px with an MLP projection and adding academic-task-oriented VQA data with simple response formatting prompts, we establish stronger baselines that achieve state-of-the-art across 11 benchmarks. Our final 13B checkpoint uses merely 1.2M publicly available data, and finishes full training in ∼1 day on a single 8-A100 node. We hope this can make state-of-the-art LMM research more accessible. Code and model will be publicly available*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_architecture.jpg"
+alt="drawing" width="600"/>
+
+<small> LLaVa architecture. Taken from the <a href="https://arxiv.org/abs/2304.08485">original paper.</a> </small>
+
+This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ) and [ybelkada](https://huggingface.co/ybelkada).
+The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/main/llava).
+
+## Usage tips
+
+- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
+
+- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
+
+- For better results, we recommend users to prompt the model with the correct prompt format: 
+
+```bash
+"USER: <image>\n<prompt>ASSISTANT:"
+```
+
+For multiple turns conversation:
+
+```bash
+"USER: <image>\n<prompt1>ASSISTANT: <answer1>USER: <prompt2>ASSISTANT: <answer2>USER: <prompt3>ASSISTANT:"
+```
+
+### Using Flash Attention 2
+
+Flash Attention 2 is an even faster, optimized version of the previous optimization, please refer to the [Flash Attention 2 section of performance docs](https://huggingface.co/docs/transformers/perf_infer_gpu_one).
+
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT.
+
+<PipelineTag pipeline="image-to-text"/>
+
+- A [Google Colab demo](https://colab.research.google.com/drive/1qsl6cd2c8gGtEW1xV5io7S8NHh-Cp1TV?usp=sharing) on how to run Llava on a free-tier Google colab instance leveraging 4-bit inference.
+- A [similar notebook](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LLaVa/Inference_with_LLaVa_for_multimodal_generation.ipynb) showcasing batched inference. 🌎
+
+
+## LlavaConfig
+
+[[autodoc]] LlavaConfig
+
+## LlavaProcessor
+
+[[autodoc]] LlavaProcessor
+
+## LlavaForConditionalGeneration
+
+[[autodoc]] LlavaForConditionalGeneration
+    - forward
diff --git a/docs/source/en/model_doc/madlad-400.md b/docs/source/en/model_doc/madlad-400.md
new file mode 100644
index 00000000000000..aeb41938499c29
--- /dev/null
+++ b/docs/source/en/model_doc/madlad-400.md
@@ -0,0 +1,68 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# MADLAD-400
+
+## Overview
+
+MADLAD-400 models were released in the paper [MADLAD-400: A Multilingual And Document-Level Large Audited Dataset](MADLAD-400: A Multilingual And Document-Level Large Audited Dataset). 
+
+The abstract from the paper is the following: 
+
+*We introduce MADLAD-400, a manually audited, general domain 3T token monolingual dataset based on CommonCrawl, spanning 419 languages. We discuss 
+the limitations revealed by self-auditing MADLAD-400, and the role data auditing
+had in the dataset creation process. We then train and release a 10.7B-parameter
+multilingual machine translation model on 250 billion tokens covering over 450
+languages using publicly available data, and find that it is competitive with models
+that are significantly larger, and report the results on different domains. In addition, we train a 8B-parameter language model, and assess the results on few-shot
+translation. We make the baseline models 1
+available to the research community.*
+
+This model was added by [Juarez Bochi](https://huggingface.co/jbochi). The original checkpoints can be found [here](https://github.com/google-research/google-research/tree/master/madlad_400). 
+
+This is a machine translation model that supports many low-resource languages, and that is competitive with models that are significantly larger.
+
+One can directly use MADLAD-400 weights without finetuning the model:
+
+```python
+>>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("google/madlad400-3b-mt")
+>>> tokenizer = AutoTokenizer.from_pretrained("google/madlad400-3b-mt")
+
+>>> inputs = tokenizer("<2pt> I love pizza!", return_tensors="pt")
+>>> outputs = model.generate(**inputs)
+>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['Eu amo pizza!']
+```
+
+Google has released the following variants:
+
+- [google/madlad400-3b-mt](https://huggingface.co/google/madlad400-3b-mt)
+
+- [google/madlad400-7b-mt](https://huggingface.co/google/madlad400-7b-mt)
+
+- [google/madlad400-7b-mt-bt](https://huggingface.co/google/madlad400-7b-mt-bt)
+
+- [google/madlad400-10b-mt](https://huggingface.co/google/madlad400-10b-mt)
+
+The original checkpoints can be found [here](https://github.com/google-research/google-research/tree/master/madlad_400).
+
+<Tip>
+
+Refer to [T5's documentation page](t5) for all API references, code examples, and notebooks. For more details regarding training and evaluation of the MADLAD-400, refer to the model card.
+
+</Tip>
diff --git a/docs/source/en/model_doc/mistral.md b/docs/source/en/model_doc/mistral.md
index 8e37bc2caf888d..8e4d75ef2382c3 100644
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@@ -99,7 +99,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> device = "cuda" # the device to load the model onto
 
->>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, use_flash_attention_2=True)
+>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
 >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
 
 >>> prompt = "My favourite condiment is"
diff --git a/docs/source/en/model_doc/mixtral.md b/docs/source/en/model_doc/mixtral.md
new file mode 100644
index 00000000000000..9f1fb49f6c835c
--- /dev/null
+++ b/docs/source/en/model_doc/mixtral.md
@@ -0,0 +1,163 @@
+<!--Copyright 2023 Mistral AI and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Mixtral
+
+## Overview
+
+Mixtral-8x7B is Mistral AI's second Large Language Model (LLM). 
+
+The Mixtral model was proposed in the by the [Mistral AI](https://mistral.ai/) team.
+
+It was introduced in the [Mixtral of Experts blogpost](https://mistral.ai/news/mixtral-of-experts/) with the following introduction:
+
+*Today, the team is proud to release Mixtral 8x7B, a high-quality sparse mixture of experts models (SMoE) with open weights. Licensed under Apache 2.0. Mixtral outperforms Llama 2 70B on most benchmarks with 6x faster inference. It is the strongest open-weight model with a permissive license and the best model overall regarding cost/performance trade-offs. In particular, it matches or outperforms GPT3.5 on most standard benchmarks.*
+
+Tips:
+
+
+- The model needs to be converted using the [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py).
+- If the model is quantized to 4bits, a single A100 is enough to fit the entire 45B model.
+
+This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ) .
+The original code can be found [here](https://github.com/mistralai/mistral-src).
+
+
+### Model Details
+
+Mixtral-45B is a decoder-based LM with the following architectural choices:
+
+* Mixtral is a Mixture of Expert (MOE) model with 8 experts per MLP, with a total of 45B paramateres but the compute required is the same as a 14B model. This is because even though each experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dipatched twice (top 2 routing) and thus the compute (the operation required at each foward computation) is just 2 X sequence_length. 
+
+The following implementation details are shared with Mistral AI's first model [mistral](~models/doc/mistral):
+* Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens
+* GQA (Grouped Query Attention) - allowing faster inference and lower cache size.
+* Byte-fallback BPE tokenizer - ensures that characters are never mapped to out of vocabulary tokens.
+
+They also provide an instruction fine-tuned model: `mistralai/Mixtral-8x7B-v0.1` which can be used for chat-based inference.
+
+For more details please read our [release blog post](https://mistral.ai/news/mixtral-of-experts/)
+
+### License
+
+`Mixtral-8x7B` is released under the Apache 2.0 license.
+
+## Usage tips
+
+`Mixtral-8x7B` can be found on the [Huggingface Hub](https://huggingface.co/mistralai)
+
+These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hub:
+
+```python
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-8x7B")
+
+>>> prompt = "My favourite condiment is"
+
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model.to(device)
+
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
+>>> tokenizer.batch_decode(generated_ids)[0]
+"The expected output"
+```
+
+To use the raw checkpoints with HuggingFace you can use the `convert_mixtral_weights_to_hf.py` script to convert them to the HuggingFace format:
+
+```bash
+python src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py \
+    --input_dir /path/to/downloaded/mistral/weights --output_dir /output/path
+```
+
+You can then load the converted model from the `output/path`:
+
+```python
+from transformers import MixtralForCausalLM, LlamaTokenizer
+
+tokenizer = LlamaTokenizer.from_pretrained("/output/path")
+model = MixtralForCausalLM.from_pretrained("/output/path")
+```
+
+## Combining Mixtral and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of [`flash-attn`](https://github.com/Dao-AILab/flash-attention) repository. Make also sure to load your model in half-precision (e.g. `torch.float16`)
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1")
+
+>>> prompt = "My favourite condiment is"
+
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model.to(device)
+
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
+>>> tokenizer.batch_decode(generated_ids)[0]
+"The expected output"
+```
+
+### Expected speedups
+
+Below is a expected speedup diagram that compares pure inference time between the native implementation in transformers using `mistralai/Mixtral-8x7B-v0.1` checkpoint and the Flash Attention 2 version of the model.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/mixtral-7b-inference-large-seqlen.png">
+</div>
+
+### Sliding window Attention
+
+The current implementation supports the sliding window attention mechanism and memory efficient cache management. 
+To enable sliding window attention, just make sure to have a `flash-attn` version that is compatible with sliding window attention (`>=2.3.0`). 
+
+The Flash Attention-2 model uses also a more memory efficient cache slicing mechanism - as recommended per the official implementation of Mistral model that use rolling cache mechanism we keep the cache size fixed (`self.config.sliding_window`), support batched generation only for `padding_side="left"` and use the absolute position of the current token to compute the positional embedding.
+
+## The Mistral Team
+
+Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+
+## MixtralConfig
+
+[[autodoc]] MixtralConfig
+
+## MixtralModel
+
+[[autodoc]] MixtralModel
+    - forward
+
+## MixtralForCausalLM
+
+[[autodoc]] MixtralForCausalLM
+    - forward
+
+## MixtralForSequenceClassification
+
+[[autodoc]] MixtralForSequenceClassification
+    - forward
diff --git a/docs/source/en/model_doc/musicgen.md b/docs/source/en/model_doc/musicgen.md
index d9e84294852dc6..bc2234ce3c4102 100644
--- a/docs/source/en/model_doc/musicgen.md
+++ b/docs/source/en/model_doc/musicgen.md
@@ -46,6 +46,16 @@ This model was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-ga
 [here](https://github.com/facebookresearch/audiocraft). The pre-trained checkpoints can be found on the
 [Hugging Face Hub](https://huggingface.co/models?sort=downloads&search=facebook%2Fmusicgen-).
 
+## Usage tips
+
+- After downloading the original checkpoints from [here](https://github.com/facebookresearch/audiocraft/blob/main/docs/MUSICGEN.md#importing--exporting-models) , you can convert them using the **conversion script** available at
+`src/transformers/models/musicgen/convert_musicgen_transformers.py` with the following command:
+
+```bash
+python src/transformers/models/musicgen/convert_musicgen_transformers.py \
+    --checkpoint small --pytorch_dump_folder /output/path --safe_serialization 
+```
+
 ## Generation
 
 MusicGen is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly
diff --git a/docs/source/en/model_doc/opt.md b/docs/source/en/model_doc/opt.md
index 68da201f99bc72..1b02b888994ecf 100644
--- a/docs/source/en/model_doc/opt.md
+++ b/docs/source/en/model_doc/opt.md
@@ -62,6 +62,55 @@ The resource should ideally demonstrate something new instead of duplicating an
 
 - A blog post on [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) with OPT.
 
+
+## Combining OPT and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> from transformers import OPTForCausalLM, GPT2Tokenizer
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
+>>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
+
+>>> prompt = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
+              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
+              "there?")
+
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model.to(device)
+
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+>>> tokenizer.batch_decode(generated_ids)[0]
+'</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love'
+```
+
+### Expected speedups
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-2.7b` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://user-images.githubusercontent.com/49240599/281101546-d2fca6d2-ee44-48f3-9534-ba8d5bee4531.png">
+</div>
+
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-350m` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://user-images.githubusercontent.com/49240599/281101682-d1144e90-0dbc-46f4-8fc8-c6206cb793c9.png">
+</div>
+
+
+
 ## OPTConfig
 
 [[autodoc]] OPTConfig
diff --git a/docs/source/en/model_doc/owlv2.md b/docs/source/en/model_doc/owlv2.md
index 12000af9ed4f39..75fab0853a9778 100644
--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@@ -56,7 +56,7 @@ OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditio
 
 >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
 >>> target_sizes = torch.Tensor([image.size[::-1]])
->>> # Convert outputs (bounding boxes and class logits) to COCO API
+>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax)
 >>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
 >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
 >>> text = texts[i]
diff --git a/docs/source/en/model_doc/owlvit.md b/docs/source/en/model_doc/owlvit.md
index 0ba26eeb37b4cd..c40d3a9e7a17fa 100644
--- a/docs/source/en/model_doc/owlvit.md
+++ b/docs/source/en/model_doc/owlvit.md
@@ -55,7 +55,7 @@ OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CL
 
 >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
 >>> target_sizes = torch.Tensor([image.size[::-1]])
->>> # Convert outputs (bounding boxes and class logits) to COCO API
+>>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
 >>> results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)
 >>> i = 0  # Retrieve predictions for the first image for the corresponding text queries
 >>> text = texts[i]
diff --git a/docs/source/en/model_doc/patchtsmixer.md b/docs/source/en/model_doc/patchtsmixer.md
new file mode 100644
index 00000000000000..fe1de509fd0000
--- /dev/null
+++ b/docs/source/en/model_doc/patchtsmixer.md
@@ -0,0 +1,90 @@
+<!--Copyright 2023 IBM and HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# PatchTSMixer
+
+## Overview
+
+The PatchTSMixer model was proposed in [TSMixer: Lightweight MLP-Mixer Model for Multivariate Time Series Forecasting](https://arxiv.org/pdf/2306.09364.pdf) by Vijay Ekambaram, Arindam Jati, Nam Nguyen, Phanwadee Sinthong and Jayant Kalagnanam.
+
+
+PatchTSMixer is a lightweight time-series modeling approach based on the MLP-Mixer architecture. In this HuggingFace implementation, we provide PatchTSMixer's capabilities to effortlessly facilitate lightweight mixing across patches, channels, and hidden features for effective multivariate time-series modeling. It also supports various attention mechanisms starting from simple gated attention to more complex self-attention blocks that can be customized accordingly. The model can be pretrained and subsequently used for various downstream tasks such as forecasting, classification and regression.
+
+
+The abstract from the paper is the following:
+
+*TSMixer is a lightweight neural architecture exclusively composed of multi-layer perceptron (MLP) modules designed for multivariate forecasting and representation learning on patched time series. Our model draws inspiration from the success of MLP-Mixer models in computer vision. We demonstrate the challenges involved in adapting Vision MLP-Mixer for time series and introduce empirically validated components to enhance accuracy. This includes a novel design paradigm of attaching online reconciliation heads to the MLP-Mixer backbone, for explicitly modeling the time-series properties such as hierarchy and channel-correlations. We also propose a Hybrid channel modeling approach to effectively handle noisy channel interactions and generalization across diverse datasets, a common challenge in existing patch channel-mixing methods. Additionally, a simple gated attention mechanism is introduced in the backbone to prioritize important features. By incorporating these lightweight components, we significantly enhance the learning capability of simple MLP structures, outperforming complex Transformer models with minimal computing usage. Moreover, TSMixer's modular design enables compatibility with both supervised and masked self-supervised learning methods, making it a promising building block for time-series Foundation Models. TSMixer outperforms state-of-the-art MLP and Transformer models in forecasting by a considerable margin of 8-60%. It also outperforms the latest strong benchmarks of Patch-Transformer models (by 1-2%) with a significant reduction in memory and runtime (2-3X).*
+
+
+
+This model was contributed by [ajati](https://huggingface.co/ajati), [vijaye12](https://huggingface.co/vijaye12), 
+[gsinthong](https://huggingface.co/gsinthong), [namctin](https://huggingface.co/namctin),
+[wmgifford](https://huggingface.co/wmgifford), [kashif](https://huggingface.co/kashif).
+
+
+## Sample usage 
+```python
+
+from transformers import PatchTSMixerConfig, PatchTSMixerForPrediction
+from transformers import Trainer, TrainingArguments,
+
+
+config = PatchTSMixerConfig(context_length = 512, prediction_length = 96)
+model = PatchTSMixerForPrediction(config)
+trainer = Trainer(model=model, args=training_args, 
+            train_dataset=train_dataset,
+            eval_dataset=valid_dataset)
+trainer.train()
+results = trainer.evaluate(test_dataset)
+```
+
+## Usage tips
+
+The model can also be used for time series classification and time series regression. See the respective [`PatchTSMixerForTimeSeriesClassification`] and [`PatchTSMixerForRegression`] classes.
+
+## PatchTSMixerConfig
+
+[[autodoc]] PatchTSMixerConfig
+
+
+## PatchTSMixerModel
+
+[[autodoc]] PatchTSMixerModel
+    - forward
+
+
+## PatchTSMixerForPrediction
+
+[[autodoc]] PatchTSMixerForPrediction
+    - forward
+
+
+## PatchTSMixerForTimeSeriesClassification
+
+[[autodoc]] PatchTSMixerForTimeSeriesClassification
+    - forward
+
+
+## PatchTSMixerForPretraining
+
+[[autodoc]] PatchTSMixerForPretraining
+    - forward
+
+
+## PatchTSMixerForRegression
+
+[[autodoc]] PatchTSMixerForRegression
+    - forward
\ No newline at end of file
diff --git a/docs/source/en/model_doc/patchtst.md b/docs/source/en/model_doc/patchtst.md
new file mode 100644
index 00000000000000..a6b8396a286b8c
--- /dev/null
+++ b/docs/source/en/model_doc/patchtst.md
@@ -0,0 +1,65 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# PatchTST
+
+## Overview
+
+The PatchTST model was proposed in [A Time Series is Worth 64 Words: Long-term Forecasting with Transformers](https://arxiv.org/abs/2211.14730) by Yuqi Nie, Nam H. Nguyen, Phanwadee Sinthong and Jayant Kalagnanam.
+
+At a high level the model vectorizes time series into patches of a given size and encodes the resulting sequence of vectors via a Transformer that then outputs the prediction length forecast via an appropriate head. The model is illustrated in the following figure:
+
+![model](https://github.com/namctin/transformers/assets/8100/150af169-29de-419a-8d98-eb78251c21fa)
+
+The abstract from the paper is the following:
+
+*We propose an efficient design of Transformer-based models for multivariate time series forecasting and self-supervised representation learning. It is based on two key components: (i) segmentation of time series into subseries-level patches which are served as input tokens to Transformer; (ii) channel-independence where each channel contains a single univariate time series that shares the same embedding and Transformer weights across all the series. Patching design naturally has three-fold benefit: local semantic information is retained in the embedding; computation and memory usage of the attention maps are quadratically reduced given the same look-back window; and the model can attend longer history. Our channel-independent patch time series Transformer (PatchTST) can improve the long-term forecasting accuracy significantly when compared with that of SOTA Transformer-based models. We also apply our model to self-supervised pre-training tasks and attain excellent fine-tuning performance, which outperforms supervised training on large datasets. Transferring of masked pre-trained representation on one dataset to others also produces SOTA forecasting accuracy.*
+
+This model was contributed by [namctin](https://huggingface.co/namctin), [gsinthong](https://huggingface.co/gsinthong), [diepi](https://huggingface.co/diepi), [vijaye12](https://huggingface.co/vijaye12), [wmgifford](https://huggingface.co/wmgifford), and [kashif](https://huggingface.co/kashif). The original code can be found [here](https://github.com/yuqinie98/PatchTST).
+
+## Usage tips
+
+The model can also be used for time series classification and time series regression. See the respective [`PatchTSTForClassification`] and [`PatchTSTForRegression`] classes.
+
+
+## PatchTSTConfig
+
+[[autodoc]] PatchTSTConfig
+
+## PatchTSTModel
+
+[[autodoc]] PatchTSTModel
+    - forward
+
+## PatchTSTForPrediction
+
+[[autodoc]] PatchTSTForPrediction
+    - forward
+
+## PatchTSTForClassification
+
+[[autodoc]] PatchTSTForClassification
+    - forward
+
+## PatchTSTForPretraining
+
+[[autodoc]] PatchTSTForPretraining
+    - forward
+
+## PatchTSTForRegression
+
+[[autodoc]] PatchTSTForRegression
+    - forward
diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md
index 337502ac31d63b..3076aa378cbe85 100644
--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@@ -76,7 +76,7 @@ The original code for Phi-1 and Phi-1.5 can be found [here](https://huggingface.
 ```python
 >>> from transformers import PhiForCausalLM, AutoTokenizer
 
->>> # define the model and tokenzier.
+>>> # define the model and tokenizer.
 >>> model = PhiForCausalLM.from_pretrained("susnato/phi-1_5_dev")
 >>> tokenizer = AutoTokenizer.from_pretrained("susnato/phi-1_5_dev")
 
@@ -94,6 +94,46 @@ The original code for Phi-1 and Phi-1.5 can be found [here](https://huggingface.
 ```
 
 
+## Combining Phi and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> from transformers import PhiForCausalLM, AutoTokenizer
+
+>>> # define the model and tokenizer and push the model and tokens to the GPU.
+>>> model = PhiForCausalLM.from_pretrained("susnato/phi-1_5_dev", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda")
+>>> tokenizer = AutoTokenizer.from_pretrained("susnato/phi-1_5_dev")
+
+>>> # feel free to change the prompt to your liking.
+>>> prompt = "If I were an AI that had just achieved"
+
+>>> # apply the tokenizer.
+>>> tokens = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+>>> # use the model to generate new tokens.
+>>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)
+
+>>> tokenizer.batch_decode(generated_output)[0]
+'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled'
+```
+
+### Expected speedups
+Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `susnato/phi-1_dev` checkpoint and the Flash Attention 2 version of the model using a sequence length of 2048.
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/phi_1_speedup_plot.jpg">
+</div>
+
+
 ## PhiConfig
 
 [[autodoc]] PhiConfig
@@ -123,4 +163,4 @@ The original code for Phi-1 and Phi-1.5 can be found [here](https://huggingface.
     - forward
 
 </pt>
-</frameworkcontent>
\ No newline at end of file
+</frameworkcontent>
diff --git a/docs/source/en/model_doc/seamless_m4t.md b/docs/source/en/model_doc/seamless_m4t.md
index 3a67a58a08a967..a75d0798cc4b46 100644
--- a/docs/source/en/model_doc/seamless_m4t.md
+++ b/docs/source/en/model_doc/seamless_m4t.md
@@ -15,6 +15,7 @@ specific language governing permissions and limitations under the License.
 ## Overview
 
 The SeamlessM4T model was proposed in [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team from Meta AI.
+This is the version 1 release of the model. For the updated version 2 release, refer to the [Seamless M4T v2 docs](./seamless_m4t_v2.md).
 
 SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
 
diff --git a/docs/source/en/model_doc/seamless_m4t_v2.md b/docs/source/en/model_doc/seamless_m4t_v2.md
new file mode 100644
index 00000000000000..7288dabb91b699
--- /dev/null
+++ b/docs/source/en/model_doc/seamless_m4t_v2.md
@@ -0,0 +1,194 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SeamlessM4T-v2
+
+## Overview
+
+The SeamlessM4T-v2 model was proposed in [Seamless: Multilingual Expressive and Streaming Speech Translation](https://ai.meta.com/research/publications/seamless-multilingual-expressive-and-streaming-speech-translation/) by the Seamless Communication team from Meta AI.
+
+SeamlessM4T-v2 is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text. It is an improvement on the [previous version](./seamless_m4t.md). For more details on the differences between v1 and v2, refer to section [Difference with SeamlessM4T-v1](#difference-with-seamlessm4t-v1).
+
+SeamlessM4T-v2 enables multiple tasks without relying on separate models:
+
+- Speech-to-speech translation (S2ST)
+- Speech-to-text translation (S2TT)
+- Text-to-speech translation (T2ST)
+- Text-to-text translation (T2TT)
+- Automatic speech recognition (ASR)
+
+[`SeamlessM4Tv2Model`] can perform all the above tasks, but each task also has its own dedicated sub-model.
+
+The abstract from the paper is the following:
+
+*Recent advancements in automatic speech translation have dramatically expanded language coverage, improved multimodal capabilities, and enabled a wide range of tasks and functionalities. That said, large-scale automatic speech translation systems today lack key features that help machine-mediated communication feel seamless when compared to human-to-human dialogue. In this work, we introduce a family of models that enable end-to-end expressive and multilingual translations in a streaming fashion. First, we contribute an improved version of the massively multilingual and multimodal SeamlessM4T model—SeamlessM4T v2. This newer model, incorporating an updated UnitY2 framework, was trained on more low-resource language data. The expanded version of SeamlessAlign adds 114,800 hours of automatically aligned data for a total of 76 languages. SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. SeamlessExpressive enables translation that preserves vocal styles and prosody. Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one’s voice. As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below.*
+
+## Usage
+
+In the following example, we'll load an Arabic audio sample and an English text sample and convert them into Russian speech and French text.
+
+First, load the processor and a checkpoint of the model:
+
+```python
+>>> from transformers import AutoProcessor, SeamlessM4Tv2Model
+
+>>> processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large")
+>>> model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large")
+```
+
+You can seamlessly use this model on text or on audio, to generated either translated text or translated audio.
+
+Here is how to use the processor to process text and audio:
+
+```python
+>>> # let's load an audio sample from an Arabic speech corpus
+>>> from datasets import load_dataset
+>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
+>>> audio_sample = next(iter(dataset))["audio"]
+
+>>> # now, process it
+>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
+
+>>> # now, process some English text as well
+>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
+```
+
+
+### Speech
+
+[`SeamlessM4Tv2Model`] can *seamlessly* generate text or speech with few or no changes. Let's target Russian voice translation:
+
+```python
+>>> audio_array_from_text = model.generate(**text_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze()
+>>> audio_array_from_audio = model.generate(**audio_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze()
+```
+
+With basically the same code, I've translated English text and Arabic speech to Russian speech samples.
+
+### Text
+
+Similarly, you can generate translated text from audio files or from text with the same model. You only have to pass `generate_speech=False` to [`SeamlessM4Tv2Model.generate`].
+This time, let's translate to French.
+
+```python 
+>>> # from audio
+>>> output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
+>>> translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+
+>>> # from text
+>>> output_tokens = model.generate(**text_inputs, tgt_lang="fra", generate_speech=False)
+>>> translated_text_from_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+```
+
+### Tips
+
+
+#### 1. Use dedicated models
+
+[`SeamlessM4Tv2Model`] is transformers top level model to generate speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint.
+For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task, the rest is exactly the same code: 
+
+```python
+>>> from transformers import SeamlessM4Tv2ForSpeechToSpeech
+>>> model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained("facebook/seamless-m4t-v2-large")
+```
+
+Or you can replace the text-to-text generation snippet with the model dedicated to the T2TT task, you only have to remove `generate_speech=False`.
+
+```python
+>>> from transformers import SeamlessM4Tv2ForTextToText
+>>> model = SeamlessM4Tv2ForTextToText.from_pretrained("facebook/seamless-m4t-v2-large")
+```
+
+Feel free to try out [`SeamlessM4Tv2ForSpeechToText`] and [`SeamlessM4Tv2ForTextToSpeech`] as well.
+
+#### 2. Change the speaker identity
+
+You have the possibility to change the speaker used for speech synthesis with the `speaker_id` argument. Some `speaker_id` works better than other for some languages!
+
+#### 3. Change the generation strategy
+
+You can use different [generation strategies](../generation_strategies) for text generation, e.g `.generate(input_ids=input_ids, text_num_beams=4, text_do_sample=True)` which will perform multinomial beam-search decoding on the text model. Note that speech generation only supports greedy - by default - or multinomial sampling, which can be used with e.g. `.generate(..., speech_do_sample=True, speech_temperature=0.6)`.
+
+#### 4. Generate speech and text at the same time
+
+Use `return_intermediate_token_ids=True` with [`SeamlessM4Tv2Model`] to return both speech and text !
+
+## Model architecture
+
+SeamlessM4T-v2 features a versatile architecture that smoothly handles the sequential generation of text and speech. This setup comprises two sequence-to-sequence (seq2seq) models. The first model translates the input modality into translated text, while the second model generates speech tokens, known as "unit tokens," from the translated text.
+
+Each modality has its own dedicated encoder with a unique architecture. Additionally, for speech output, a vocoder inspired by the [HiFi-GAN](https://arxiv.org/abs/2010.05646) architecture is placed on top of the second seq2seq model.
+
+### Difference with SeamlessM4T-v1
+
+The architecture of this new version differs from the first in a few aspects:
+
+#### Improvements on the second-pass model
+
+The second seq2seq model, named text-to-unit model, is now non-auto regressive, meaning that it computes units in a **single forward pass**. This achievement is made possible by:
+- the use of **character-level embeddings**, meaning that each character of the predicted translated text has its own embeddings, which are then used to predict the unit tokens.
+- the use of an intermediate duration predictor, that predicts speech duration at the **character-level** on the predicted translated text.
+- the use of a new text-to-unit decoder mixing convolutions and self-attention to handle longer context.
+
+#### Difference in the speech encoder
+
+The speech encoder, which is used during the first-pass generation process to predict the translated text, differs mainly from the previous speech encoder through these mechanisms:
+- the use of chunked attention mask to prevent attention across chunks, ensuring that each position attends only to positions within its own chunk and a fixed number of previous chunks.
+- the use of relative position embeddings which only considers distance between sequence elements rather than absolute positions. Please refer to [Self-Attentionwith Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155) for more details.
+- the use of a causal depth-wise convolution instead of a non-causal one.
+
+### Generation process
+
+Here's how the generation process works:
+
+- Input text or speech is processed through its specific encoder.
+- A decoder creates text tokens in the desired language.
+- If speech generation is required, the second seq2seq model, generates unit tokens in an non auto-regressive way.
+- These unit tokens are then passed through the final vocoder to produce the actual speech.
+
+
+This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication).
+
+## SeamlessM4Tv2Model
+
+[[autodoc]] SeamlessM4Tv2Model
+    - generate
+
+
+## SeamlessM4Tv2ForTextToSpeech
+
+[[autodoc]] SeamlessM4Tv2ForTextToSpeech
+    - generate
+
+
+## SeamlessM4Tv2ForSpeechToSpeech
+
+[[autodoc]] SeamlessM4Tv2ForSpeechToSpeech
+    - generate
+
+
+## SeamlessM4Tv2ForTextToText
+
+[[autodoc]] transformers.SeamlessM4Tv2ForTextToText
+    - forward
+    - generate
+
+## SeamlessM4Tv2ForSpeechToText
+
+[[autodoc]] transformers.SeamlessM4Tv2ForSpeechToText
+    - forward
+    - generate
+
+## SeamlessM4Tv2Config
+
+[[autodoc]] SeamlessM4Tv2Config
diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md
index 704d05987b9beb..a7e78976cf94fe 100644
--- a/docs/source/en/model_doc/t5.md
+++ b/docs/source/en/model_doc/t5.md
@@ -314,7 +314,7 @@ The predicted tokens will then be placed between the sentinel tokens.
 
 ## Performance
 
-If you'd like a faster training and inference performance, install [apex](https://github.com/NVIDIA/apex#quick-start) and then the model will automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter.
+If you'd like a faster training and inference performance, install [NVIDIA APEX](https://github.com/NVIDIA/apex#quick-start) for NVIDIA GPUs, or [ROCm APEX](https://github.com/ROCmSoftwarePlatform/apex) for AMD GPUs and then the model will automatically use `apex.normalization.FusedRMSNorm` instead of `T5LayerNorm`. The former uses an optimized fused kernel which is several times faster than the latter.
 
 
 ## Resources
diff --git a/docs/source/en/model_doc/transfo-xl.md b/docs/source/en/model_doc/transfo-xl.md
index d75e3a37b990cf..dae7e532be66f3 100644
--- a/docs/source/en/model_doc/transfo-xl.md
+++ b/docs/source/en/model_doc/transfo-xl.md
@@ -16,6 +16,35 @@ rendered properly in your Markdown viewer.
 
 # Transformer XL
 
+<Tip warning={true}>
+
+This model is in maintenance mode only, so we won't accept any new PRs changing its code. This model was deprecated due to security issues linked to `pickle.load`.
+
+We recommend switching to more recent models for improved security.
+
+In case you would still like to use `TransfoXL` in your experiments, we recommend using the [Hub checkpoint](https://huggingface.co/transfo-xl-wt103) with a specific revision to ensure you are downloading safe files from the Hub.
+
+You will need to set the environment variable `TRUST_REMOTE_CODE` to `True` in order to allow the
+usage of `pickle.load()`:
+
+```python
+import os
+from transformers import TransfoXLTokenizer, TransfoXLLMHeadModel
+
+os.environ["TRUST_REMOTE_CODE"] = "True"
+
+checkpoint = 'transfo-xl-wt103'
+revision = '40a186da79458c9f9de846edfaea79c412137f97'
+
+tokenizer = TransfoXLTokenizer.from_pretrained(checkpoint, revision=revision)
+model = TransfoXLLMHeadModel.from_pretrained(checkpoint, revision=revision)
+```
+
+If you run into any issues running this model, please reinstall the last version that supported this model: v4.35.0.
+You can do so by running the following command: `pip install -U transformers==4.35.0`.
+
+</Tip>
+
 <div class="flex flex-wrap space-x-1">
 <a href="https://huggingface.co/models?filter=transfo-xl">
 <img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
@@ -79,13 +108,13 @@ TransformerXL does **not** work with *torch.nn.DataParallel* due to a bug in PyT
 
 ## TransfoXL specific outputs
 
-[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
+[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
 
-[[autodoc]] models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
+[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
 
-[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput
+[[autodoc]] models.deprecated.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput
 
-[[autodoc]] models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
+[[autodoc]] models.deprecated.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
 
 <frameworkcontent>
 <pt>
diff --git a/docs/source/en/model_doc/tvp.md b/docs/source/en/model_doc/tvp.md
new file mode 100644
index 00000000000000..22b400a06c736a
--- /dev/null
+++ b/docs/source/en/model_doc/tvp.md
@@ -0,0 +1,186 @@
+<!--Copyright 2023 The Intel Team Authors and HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# TVP
+
+## Overview
+
+The text-visual prompting (TVP) framework was proposed in the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
+
+The abstract from the paper is the following:
+
+*In this paper, we study the problem of temporal video grounding (TVG), which aims to predict the starting/ending time points of moments described by a text sentence within a long untrimmed video. Benefiting from fine-grained 3D visual features, the TVG techniques have achieved remarkable progress in recent years. However, the high complexity of 3D convolutional neural networks (CNNs) makes extracting dense 3D visual features time-consuming, which calls for intensive memory and computing resources. Towards efficient TVG, we propose a novel text-visual prompting (TVP) framework, which incorporates optimized perturbation patterns (that we call ‘prompts’) into both visual inputs and textual features of a TVG model. In sharp contrast to 3D CNNs, we show that TVP allows us to effectively co-train vision encoder and language encoder in a 2D TVG model and improves the performance of cross-modal feature fusion using only low-complexity sparse 2D visual features. Further, we propose a Temporal-Distance IoU (TDIoU) loss for efficient learning of TVG. Experiments on two benchmark datasets, Charades-STA and ActivityNet Captions datasets, empirically show that the proposed TVP significantly boosts the performance of 2D TVG (e.g., 9.79% improvement on Charades-STA and 30.77% improvement on ActivityNet Captions) and achieves 5× inference acceleration over TVG using 3D visual features.*
+
+This research addresses temporal video grounding (TVG), which is the process of pinpointing the start and end times of specific events in a long video, as described by a text sentence. Text-visual prompting (TVP), is proposed to enhance TVG. TVP involves integrating specially designed patterns, known as 'prompts', into both the visual (image-based) and textual (word-based) input components of a TVG model. These prompts provide additional spatial-temporal context, improving the model's ability to accurately determine event timings in the video. The approach employs 2D visual inputs in place of 3D ones. Although 3D inputs offer more spatial-temporal detail, they are also more time-consuming to process. The use of 2D inputs with the prompting method aims to provide similar levels of context and accuracy more efficiently.
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/tvp_architecture.png"
+alt="drawing" width="600"/>
+
+<small> TVP architecture. Taken from the <a href="https://arxiv.org/abs/2303.04995">original paper.</a> </small>
+
+This model was contributed by [Jiqing Feng](https://huggingface.co/Jiqing). The original code can be found [here](https://github.com/intel/TVP).
+
+## Usage tips and examples
+
+Prompts are optimized perturbation patterns, which would be added to input video frames or text features. Universal set refers to using the same exact set of prompts for any input, this means that these prompts are added consistently to all video frames and text features, regardless of the input's content.
+
+TVP consists of a visual encoder and cross-modal encoder. A universal set of visual prompts and text prompts to be integrated into sampled video frames and textual features, respectively. Specially, a set of different visual prompts are applied to uniformly-sampled frames of one untrimmed video in order.
+
+The goal of this model is to incorporate trainable prompts into both visual inputs and textual features to temporal video grounding(TVG) problems.
+In principle, one can apply any visual, cross-modal encoder in the proposed architecture.
+
+The [`TvpProcessor`] wraps [`BertTokenizer`] and [`TvpImageProcessor`] into a single instance to both
+encode the text and prepare the images respectively.
+
+The following example shows how to run temporal video grounding using [`TvpProcessor`] and [`TvpForVideoGrounding`].
+```python
+import av
+import cv2
+import numpy as np
+import torch
+from huggingface_hub import hf_hub_download
+from transformers import AutoProcessor, TvpForVideoGrounding
+
+
+def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
+    '''
+    Convert the video from its original fps to the target_fps and decode the video with PyAV decoder.
+    Args:
+        container (container): pyav container.
+        sampling_rate (int): frame sampling rate (interval between two sampled frames).
+        num_frames (int): number of frames to sample.
+        clip_idx (int): if clip_idx is -1, perform random temporal sampling.
+            If clip_idx is larger than -1, uniformly split the video to num_clips
+            clips, and select the clip_idx-th video clip.
+        num_clips (int): overall number of clips to uniformly sample from the given video.
+        target_fps (int): the input video may have different fps, convert it to
+            the target video fps before frame sampling.
+    Returns:
+        frames (tensor): decoded frames from the video. Return None if the no
+            video stream was found.
+        fps (float): the number of frames per second of the video.
+    '''
+    video = container.streams.video[0]
+    fps = float(video.average_rate)
+    clip_size = sampling_rate * num_frames / target_fps * fps
+    delta = max(num_frames - clip_size, 0)
+    start_idx = delta * clip_idx / num_clips
+    end_idx = start_idx + clip_size - 1
+    timebase = video.duration / num_frames
+    video_start_pts = int(start_idx * timebase)
+    video_end_pts = int(end_idx * timebase)
+    seek_offset = max(video_start_pts - 1024, 0)
+    container.seek(seek_offset, any_frame=False, backward=True, stream=video)
+    frames = {}
+    for frame in container.decode(video=0):
+        if frame.pts < video_start_pts:
+            continue
+        frames[frame.pts] = frame
+        if frame.pts > video_end_pts:
+            break
+    frames = [frames[pts] for pts in sorted(frames)]
+    return frames, fps
+
+
+def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps):
+    '''
+    Decode the video and perform temporal sampling.
+    Args:
+        container (container): pyav container.
+        sampling_rate (int): frame sampling rate (interval between two sampled frames).
+        num_frames (int): number of frames to sample.
+        clip_idx (int): if clip_idx is -1, perform random temporal sampling.
+            If clip_idx is larger than -1, uniformly split the video to num_clips
+            clips, and select the clip_idx-th video clip.
+        num_clips (int): overall number of clips to uniformly sample from the given video.
+        target_fps (int): the input video may have different fps, convert it to
+            the target video fps before frame sampling.
+    Returns:
+        frames (tensor): decoded frames from the video.
+    '''
+    assert clip_idx >= -2, "Not a valied clip_idx {}".format(clip_idx)
+    frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps)
+    clip_size = sampling_rate * num_frames / target_fps * fps
+    index = np.linspace(0, clip_size - 1, num_frames)
+    index = np.clip(index, 0, len(frames) - 1).astype(np.int64)
+    frames = np.array([frames[idx].to_rgb().to_ndarray() for idx in index])
+    frames = frames.transpose(0, 3, 1, 2)
+    return frames
+
+
+file = hf_hub_download(repo_id="Intel/tvp_demo", filename="AK2KG.mp4", repo_type="dataset")
+model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base")
+
+decoder_kwargs = dict(
+    container=av.open(file, metadata_errors="ignore"),
+    sampling_rate=1,
+    num_frames=model.config.num_frames,
+    clip_idx=0,
+    num_clips=1,
+    target_fps=3,
+)
+raw_sampled_frms = decode(**decoder_kwargs)
+
+text = "a person is sitting on a bed."
+processor = AutoProcessor.from_pretrained("Intel/tvp-base")
+model_inputs = processor(
+    text=[text], videos=list(raw_sampled_frms), return_tensors="pt", max_text_length=100#, size=size
+)
+
+model_inputs["pixel_values"] = model_inputs["pixel_values"].to(model.dtype)
+output = model(**model_inputs)
+
+def get_video_duration(filename):
+    cap = cv2.VideoCapture(filename)
+    if cap.isOpened():
+        rate = cap.get(5)
+        frame_num = cap.get(7)
+        duration = frame_num/rate
+        return duration
+    return -1
+
+duration = get_video_duration(file)
+start, end = processor.post_process_video_grounding(output.logits, duration)
+
+print(f"The time slot of the video corresponding to the text \"{text}\" is from {start}s to {end}s")
+```
+
+Tips:
+
+- This implementation of TVP uses [`BertTokenizer`] to generate text embeddings and Resnet-50 model to compute visual embeddings.
+- Checkpoints for pre-trained [tvp-base](https://huggingface.co/Intel/tvp-base) is released.
+- Please refer to [Table 2](https://arxiv.org/pdf/2303.04995.pdf) for TVP's performance on Temporal Video Grounding task.
+
+
+## TvpConfig
+
+[[autodoc]] TvpConfig
+
+## TvpImageProcessor
+
+[[autodoc]] TvpImageProcessor
+    - preprocess
+
+## TvpProcessor
+
+[[autodoc]] TvpProcessor
+    - __call__
+
+## TvpModel
+
+[[autodoc]] TvpModel
+    - forward
+
+## TvpForVideoGrounding
+
+[[autodoc]] TvpForVideoGrounding
+    - forward
diff --git a/docs/source/en/model_doc/univnet.md b/docs/source/en/model_doc/univnet.md
new file mode 100644
index 00000000000000..45bd94732773ae
--- /dev/null
+++ b/docs/source/en/model_doc/univnet.md
@@ -0,0 +1,80 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# UnivNet
+
+## Overview
+
+The UnivNet model was proposed in [UnivNet: A Neural Vocoder with Multi-Resolution Spectrogram Discriminators for High-Fidelity Waveform Generation](https://arxiv.org/abs/2106.07889) by Won Jang, Dan Lim, Jaesam Yoon, Bongwan Kin, and Juntae Kim.
+The UnivNet model is a generative adversarial network (GAN) trained to synthesize high fidelity speech waveforms. The UnivNet model shared in `transformers` is the *generator*, which maps a conditioning log-mel spectrogram and optional noise sequence to a speech waveform (e.g. a vocoder). Only the generator is required for inference. The *discriminator* used to train the `generator` is not implemented.
+
+The abstract from the paper is the following:
+
+*Most neural vocoders employ band-limited mel-spectrograms to generate waveforms. If full-band spectral features are used as the input, the vocoder can be provided with as much acoustic information as possible. However, in some models employing full-band mel-spectrograms, an over-smoothing problem occurs as part of which non-sharp spectrograms are generated. To address this problem, we propose UnivNet, a neural vocoder that synthesizes high-fidelity waveforms in real time. Inspired by works in the field of voice activity detection, we added a multi-resolution spectrogram discriminator that employs multiple linear spectrogram magnitudes computed using various parameter sets. Using full-band mel-spectrograms as input, we expect to generate high-resolution signals by adding a discriminator that employs spectrograms of multiple resolutions as the input. In an evaluation on a dataset containing information on hundreds of speakers, UnivNet obtained the best objective and subjective results among competing models for both seen and unseen speakers. These results, including the best subjective score for text-to-speech, demonstrate the potential for fast adaptation to new speakers without a need for training from scratch.*
+
+Tips:
+
+- The `noise_sequence` argument for [`UnivNetModel.forward`] should be standard Gaussian noise (such as from `torch.randn`) of shape `([batch_size], noise_length, model.config.model_in_channels)`, where `noise_length` should match the length dimension (dimension 1) of the `input_features` argument. If not supplied, it will be randomly generated; a `torch.Generator` can be supplied to the `generator` argument so that the forward pass can be reproduced. (Note that [`UnivNetFeatureExtractor`] will return generated noise by default, so it shouldn't be necessary to generate `noise_sequence` manually.)
+- Padding added by [`UnivNetFeatureExtractor`] can be removed from the [`UnivNetModel`] output through the [`UnivNetFeatureExtractor.batch_decode`] method, as shown in the usage example below.
+- Padding the end of each waveform with silence can reduce artifacts at the end of the generated audio sample. This can be done by supplying `pad_end = True` to [`UnivNetFeatureExtractor.__call__`]. See [this issue](https://github.com/seungwonpark/melgan/issues/8) for more details.
+
+Usage Example:
+
+```python
+import torch
+from scipy.io.wavfile import write
+from datasets import Audio, load_dataset
+
+from transformers import UnivNetFeatureExtractor, UnivNetModel
+
+model_id_or_path = "dg845/univnet-dev"
+model = UnivNetModel.from_pretrained(model_id_or_path)
+feature_extractor = UnivNetFeatureExtractor.from_pretrained(model_id_or_path)
+
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+# Resample the audio to the model and feature extractor's sampling rate.
+ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
+# Pad the end of the converted waveforms to reduce artifacts at the end of the output audio samples.
+inputs = feature_extractor(
+    ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], pad_end=True, return_tensors="pt"
+)
+
+with torch.no_grad():
+    audio = model(**inputs)
+
+# Remove the extra padding at the end of the output.
+audio = feature_extractor.batch_decode(**audio)[0]
+# Convert to wav file
+write("sample_audio.wav", feature_extractor.sampling_rate, audio)
+```
+
+This model was contributed by [dg845](https://huggingface.co/dg845).
+To the best of my knowledge, there is no official code release, but an unofficial implementation can be found at [maum-ai/univnet](https://github.com/maum-ai/univnet) with pretrained checkpoints [here](https://github.com/maum-ai/univnet#pre-trained-model).
+
+
+## UnivNetConfig
+
+[[autodoc]] UnivNetConfig
+
+## UnivNetFeatureExtractor
+
+[[autodoc]] UnivNetFeatureExtractor
+    - __call__
+
+## UnivNetModel
+
+[[autodoc]] UnivNetModel
+    - forward
\ No newline at end of file
diff --git a/docs/source/en/model_doc/vipllava.md b/docs/source/en/model_doc/vipllava.md
new file mode 100644
index 00000000000000..c5f3c5f55f2c56
--- /dev/null
+++ b/docs/source/en/model_doc/vipllava.md
@@ -0,0 +1,61 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# VipLlava
+
+## Overview
+
+The VipLlava model was proposed in [Making Large Multimodal Models Understand Arbitrary Visual Prompts](https://arxiv.org/abs/2312.00784) by Mu Cai, Haotian Liu, Siva Karthik Mustikovela, Gregory P. Meyer, Yuning Chai, Dennis Park, Yong Jae Lee.
+
+VipLlava enhances the training protocol of Llava by marking images and interact with the model using natural cues like a "red bounding box" or "pointed arrow" during training.
+
+The abstract from the paper is the following:
+
+*While existing large vision-language multimodal models focus on whole image understanding, there is a prominent gap in achieving region-specific comprehension. Current approaches that use textual coordinates or spatial encodings often fail to provide a user-friendly interface for visual prompting. To address this challenge, we introduce a novel multimodal model capable of decoding arbitrary visual prompts. This allows users to intuitively mark images and interact with the model using natural cues like a "red bounding box" or "pointed arrow". Our simple design directly overlays visual markers onto the RGB image, eliminating the need for complex region encodings, yet achieves state-of-the-art performance on region-understanding tasks like Visual7W, PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present ViP-Bench, a comprehensive benchmark to assess the capability of models in understanding visual prompts across multiple dimensions, enabling future research in this domain. Code, data, and model are publicly available.*
+
+Tips:
+
+- The architecture is similar than llava architecture except that the multi-modal projector takes a set of concatenated vision hidden states and has an additional layernorm layer on that module.
+
+- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
+
+- Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results.
+
+- For better results, we recommend users to prompt the model with the correct prompt format: 
+
+```bash
+"USER: <image>\n<prompt>ASSISTANT:"
+```
+
+For multiple turns conversation:
+
+```bash
+"USER: <image>\n<prompt1>ASSISTANT: <answer1>USER: <prompt2>ASSISTANT: <answer2>USER: <prompt3>ASSISTANT:"
+```
+
+The original code can be found [here](https://github.com/mu-cai/ViP-LLaVA).
+
+This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada)
+
+
+## VipLlavaConfig
+
+[[autodoc]] VipLlavaConfig
+
+## VipLlavaForConditionalGeneration
+
+[[autodoc]] VipLlavaForConditionalGeneration
+    - forward
diff --git a/docs/source/en/model_doc/whisper.md b/docs/source/en/model_doc/whisper.md
index 8d73a5655fdf3f..37411209bf9157 100644
--- a/docs/source/en/model_doc/whisper.md
+++ b/docs/source/en/model_doc/whisper.md
@@ -34,13 +34,13 @@ The original code can be found [here](https://github.com/openai/whisper).
 - Inference is currently only implemented for short-form i.e. audio is pre-segmented into <=30s segments. Long-form (including timestamps) will be implemented in a future release.
 - One can use [`WhisperProcessor`] to prepare audio for the model, and decode the predicted ID's back into text.
 
-- To convert the tokenizer, we recommend using the following:
+- To convert the model and the processor, we recommend using the following:
 
 ```bash
-python src/transformers/models/whisper/convert_openai_to_hf.py --checkpoint_path "" --pytorch_dump_folder_path "Arthur/whisper-3" --convert_tokenizer True --whisper_version 3 --multilingual True
+python src/transformers/models/whisper/convert_openai_to_hf.py --checkpoint_path "" --pytorch_dump_folder_path "Arthur/whisper-3" --convert_preprocessor True
 ```
-Here the `whisper_version` will set the number of languages to `100` to account for `cantonese` which was added in `whisper-large-v3`.
-
+The script will automatically determine all necessary parameters from the OpenAI checkpoint. A `tiktoken` library needs to be installed
+to perform the conversion of the OpenAI tokenizer to the `tokenizers` version.
 
 ## Inference
 
@@ -75,6 +75,19 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained
 ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
 ```
 
+## Resources
+
+A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Whisper. If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
+
+- A fork with a script to [convert a Whisper model in Hugging Face format to OpenAI format](https://github.com/zuazo-forks/transformers/blob/convert_hf_to_openai/src/transformers/models/whisper/convert_hf_to_openai.py). 🌎
+Usage example:
+```bash
+pip install -U openai-whisper
+python convert_hf_to_openai.py \
+    --checkpoint openai/whisper-tiny \
+    --whisper_dump_path whisper-tiny-openai.pt
+```
+
 ## WhisperConfig
 
 [[autodoc]] WhisperConfig
diff --git a/docs/source/en/pad_truncation.md b/docs/source/en/pad_truncation.md
index 8094dc1bc2aac2..cc623bca48a402 100644
--- a/docs/source/en/pad_truncation.md
+++ b/docs/source/en/pad_truncation.md
@@ -54,7 +54,7 @@ The following table summarizes the recommended way to setup padding and truncati
 |                                      |                                   | `tokenizer(batch_sentences, padding='longest')`                                        |
 |                                      | padding to max model input length | `tokenizer(batch_sentences, padding='max_length')`                                     |
 |                                      | padding to specific length        | `tokenizer(batch_sentences, padding='max_length', max_length=42)`                      |
-|                                      | padding to a multiple of a value  | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8)                        |
+|                                      | padding to a multiple of a value  | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8)`                        |
 | truncation to max model input length | no padding                        | `tokenizer(batch_sentences, truncation=True)` or                                       |
 |                                      |                                   | `tokenizer(batch_sentences, truncation=STRATEGY)`                                      |
 |                                      | padding to max sequence in batch  | `tokenizer(batch_sentences, padding=True, truncation=True)` or                         |
diff --git a/docs/source/en/perf_hardware.md b/docs/source/en/perf_hardware.md
index a28824346e4b17..18c70e1b30a5c2 100644
--- a/docs/source/en/perf_hardware.md
+++ b/docs/source/en/perf_hardware.md
@@ -134,7 +134,7 @@ Here is the full benchmark code and outputs:
 ```bash
 # DDP w/ NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
@@ -143,7 +143,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch
 
 # DDP w/o NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index ba339c1a3068fa..21fce43427ab5b 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -15,7 +15,7 @@ rendered properly in your Markdown viewer.
 
 # GPU inference
 
-GPUs are the standard choice of hardware for machine learning, unlike CPUs, because they are optimized for memory bandwidth and parallelism. To keep up with the larger sizes of modern models or to run these large models on existing and older hardware, there are several optimizations you can use to speed up GPU inference. In this guide, you'll learn how to use FlashAttention-2 (a more memory-efficient attention mechanism), BetterTransformer (a PyTorch native fastpath execution), and bitsandbytes to quantize your model to a lower precision. Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime on Nvidia GPUs.
+GPUs are the standard choice of hardware for machine learning, unlike CPUs, because they are optimized for memory bandwidth and parallelism. To keep up with the larger sizes of modern models or to run these large models on existing and older hardware, there are several optimizations you can use to speed up GPU inference. In this guide, you'll learn how to use FlashAttention-2 (a more memory-efficient attention mechanism), BetterTransformer (a PyTorch native fastpath execution), and bitsandbytes to quantize your model to a lower precision. Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime on Nvidia and AMD GPUs.
 
 <Tip>
 
@@ -36,15 +36,31 @@ FlashAttention-2 is experimental and may change considerably in future versions.
 1. additionally parallelizing the attention computation over sequence length
 2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them
 
-FlashAttention-2 supports inference with Llama, Mistral, Falcon and Bark models. You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.
+FlashAttention-2 is currently supported for the following architectures:
+* [Bark](https://huggingface.co/docs/transformers/model_doc/bark#transformers.BarkModel)
+* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
+* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
+* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
+* [GPTNeo](https://huggingface.co/docs/transformers/model_doc/gpt_neo#transformers.GPTNeoModel)
+* [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel)
+* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
+* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
+* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
+* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
+* [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
+* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
+* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
+* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
+* [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
+* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
 
-Before you begin, make sure you have FlashAttention-2 installed (see the [installation](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) guide for more details about prerequisites):
+You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.
 
-```bash
-pip install flash-attn --no-build-isolation
-```
+Before you begin, make sure you have FlashAttention-2 installed. For NVIDIA GPUs, the library is installable through pip: `pip install flash-attn --no-build-isolation`. We strongly suggest to refer to the [detailed installation instructions](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features).
+
+FlashAttention-2 is also supported on AMD GPUs, with the current support limited to **Instinct MI210 and Instinct MI250**. We strongly suggest to use the following [Dockerfile](https://github.com/huggingface/optimum-amd/tree/main/docker/transformers-pytorch-amd-gpu-flash/Dockerfile) to use FlashAttention-2 on AMD GPUs.
 
-To enable FlashAttention-2, add the `use_flash_attention_2` parameter to [`~AutoModelForCausalLM.from_pretrained`]:
+To enable FlashAttention-2, pass the argument `attn_implementation="flash_attention_2"` to [`~AutoModelForCausalLM.from_pretrained`]:
 
 ```python
 import torch
@@ -56,13 +72,15 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     torch_dtype=torch.bfloat16, 
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 ```
 
 <Tip>
 
-FlashAttention-2 can only be used when the model's dtype is  `fp16` or `bf16`, and it only runs on Nvidia GPUs. Make sure to cast your model to the appropriate dtype and load them on a supported device before using FlashAttention-2.
+FlashAttention-2 can only be used when the model's dtype is `fp16` or `bf16`. Make sure to cast your model to the appropriate dtype and load them on a supported device before using FlashAttention-2.
+
+Note that `use_flash_attention_2=True` can also be used to enable Flash Attention 2, but is deprecated in favor of `attn_implementation="flash_attention_2"`.
   
 </Tip>
 
@@ -79,14 +97,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     load_in_8bit=True,
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 
 # load in 4bit
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     load_in_4bit=True,
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 ```
 
@@ -126,41 +144,21 @@ FlashAttention is more memory efficient, meaning you can train on much larger se
 <img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-large-seqlen-padding.png">
 </div>
 
-## BetterTransformer
-
-<Tip>
-
-Check out our benchmarks with BetterTransformer and scaled dot product attention in the [Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0](https://pytorch.org/blog/out-of-the-box-acceleration/) and learn more about the fastpath execution in the [BetterTransformer](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) blog post.
-
-</Tip>
-
-BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are:
-
-1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps
-2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors
-
-BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention (SDPA)](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention), and it calls optimized kernels like [FlashAttention](https://huggingface.co/papers/2205.14135) under the hood.
+## FlashAttention and memory-efficient attention through PyTorch's scaled_dot_product_attention 
 
-Before you start, make sure you have 🤗 Optimum [installed](https://huggingface.co/docs/optimum/installation).
+PyTorch's [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. SDPA support is currently being added natively in Transformers, and is used by default for `torch>=2.1.1` when an implementation is available.
 
-Then you can enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method:
+For now, Transformers supports inference and training through SDPA for the following architectures:
+* [Bart](https://huggingface.co/docs/transformers/model_doc/bart#transformers.BartModel)
+* [GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode#transformers.GPTBigCodeModel)
+* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
+* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
+* [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
+* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
 
-```python
-model = model.to_bettertransformer()
-```
+Note that FlashAttention can only be used for models with the `fp16` or `bf16` torch type, so make sure to cast your model to the appropriate type before using it.
 
-You can return the original Transformers model with the [`~PreTrainedModel.reverse_bettertransformer`] method. You should use this before saving your model to use the canonical Transformers modeling:
-
-```py
-model = model.reverse_bettertransformer()
-model.save_pretrained("saved_model")
-```
-
-### FlashAttention
-
-SDPA can also call FlashAttention kernels under the hood. FlashAttention can only be used for models using the `fp16` or `bf16` dtype, so make sure to cast your model to the appropriate dtype before using it.
-
-To enable FlashAttention or to check whether it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
+By default, `torch.nn.functional.scaled_dot_product_attention` selects the most performant kernel available, but to check whether a backend is available in a given setting (hardware, problem size), you can use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
 
 ```diff
 import torch
@@ -189,11 +187,48 @@ RuntimeError: No available kernel. Aborting execution.
 pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
 ```
 
+## BetterTransformer
+
+<Tip warning={true}>
+
+Part of BetterTransformer features are being upstreamed in Transformers, with native `torch.nn.scaled_dot_product_attention` default support. BetterTransformer still has a wider coverage than the Transformers SDPA integration, but you can expect more and more architectures to support natively SDPA in Transformers.
+
+</Tip>
+
+
+<Tip>
+
+Check out our benchmarks with BetterTransformer and scaled dot product attention in the [Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0](https://pytorch.org/blog/out-of-the-box-acceleration/) and learn more about the fastpath execution in the [BetterTransformer](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) blog post.
+
+</Tip>
+
+BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are:
+
+1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps
+2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors
+
+BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention (SDPA)](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention), and it calls optimized kernels like [FlashAttention](https://huggingface.co/papers/2205.14135) under the hood.
+
+Before you start, make sure you have 🤗 Optimum [installed](https://huggingface.co/docs/optimum/installation).
+
+Then you can enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method:
+
+```python
+model = model.to_bettertransformer()
+```
+
+You can return the original Transformers model with the [`~PreTrainedModel.reverse_bettertransformer`] method. You should use this before saving your model to use the canonical Transformers modeling:
+
+```py
+model = model.reverse_bettertransformer()
+model.save_pretrained("saved_model")
+```
+
 ## bitsandbytes
 
 bitsandbytes is a quantization library that includes support for 4-bit and 8-bit quantization. Quantization reduces your model size compared to its native full precision version, making it easier to fit large models onto GPUs with limited memory.
 
-Make sure you have bitsnbytes and 🤗 Accelerate installed:
+Make sure you have bitsandbytes and 🤗 Accelerate installed:
 
 ```bash
 # these versions support 8-bit and 4-bit
@@ -276,13 +311,13 @@ Feel free to try running a 11 billion parameter [T5 model](https://colab.researc
 
 <Tip>
 
-Learn more details about using ORT with 🤗 Optimum in the [Accelerated inference on NVIDIA GPUs](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#accelerated-inference-on-nvidia-gpus) guide. This section only provides a brief and simple example.
+Learn more details about using ORT with 🤗 Optimum in the [Accelerated inference on NVIDIA GPUs](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#accelerated-inference-on-nvidia-gpus) and [Accelerated inference on AMD GPUs](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu#accelerated-inference-on-amd-gpus) guides. This section only provides a brief and simple example.
 
 </Tip>
 
-ONNX Runtime (ORT) is a model accelerator that supports accelerated inference on Nvidia GPUs. ORT uses optimization techniques like fusing common operations into a single node and constant folding to reduce the number of computations performed and speedup inference. ORT also places the most computationally intensive operations on the GPU and the rest on the CPU to intelligently distribute the workload between the two devices.
+ONNX Runtime (ORT) is a model accelerator that supports accelerated inference on Nvidia GPUs, and AMD GPUs that use [ROCm](https://www.amd.com/en/products/software/rocm.html) stack. ORT uses optimization techniques like fusing common operations into a single node and constant folding to reduce the number of computations performed and speedup inference. ORT also places the most computationally intensive operations on the GPU and the rest on the CPU to intelligently distribute the workload between the two devices.
 
-ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. You'll need to use an [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and specify the `provider` parameter which can be set to either [`CUDAExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#cudaexecutionprovider) or [`TensorrtExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#tensorrtexecutionprovider). If you want to load a model that was not yet exported to ONNX, you can set `export=True` to convert your model on-the-fly to the ONNX format :
+ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. You'll need to use an [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and specify the `provider` parameter which can be set to either [`CUDAExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#cudaexecutionprovider), [`ROCMExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/amdgpu) or [`TensorrtExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#tensorrtexecutionprovider). If you want to load a model that was not yet exported to ONNX, you can set `export=True` to convert your model on-the-fly to the ONNX format:
 
 ```py
 from optimum.onnxruntime import ORTModelForSequenceClassification
diff --git a/docs/source/en/perf_train_cpu_many.md b/docs/source/en/perf_train_cpu_many.md
index 4c131430babdae..4c5ffa35cdb64b 100644
--- a/docs/source/en/perf_train_cpu_many.md
+++ b/docs/source/en/perf_train_cpu_many.md
@@ -15,7 +15,8 @@ rendered properly in your Markdown viewer.
 
 # Efficient Training on Multiple CPUs
 
-When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling distributed CPU training efficiently.
+When training on a single CPU is too slow, we can use multiple CPUs. This guide focuses on PyTorch-based DDP enabling
+distributed CPU training efficiently on [bare metal](#usage-in-trainer) and [Kubernetes](#usage-with-kubernetes).
 
 ## Intel® oneCCL Bindings for PyTorch
 
@@ -25,7 +26,7 @@ Module `oneccl_bindings_for_pytorch` (`torch_ccl` before version 1.12)  implemen
 
 Check more detailed information for [oneccl_bind_pt](https://github.com/intel/torch-ccl).
 
-### Intel® oneCCL Bindings for PyTorch installation:
+### Intel® oneCCL Bindings for PyTorch installation
 
 Wheel files are available for the following Python versions:
 
@@ -68,9 +69,9 @@ torch_ccl_path=$(python -c "import torch; import torch_ccl; import os;  print(os
 source $torch_ccl_path/env/setvars.sh
 ```
 
-#### IPEX installation:
+#### Intel® Extension for PyTorch installation
 
-IPEX provides performance optimizations for CPU training with both Float32 and BFloat16, you could refer [single CPU section](./perf_train_cpu).
+Intel Extension for PyTorch (IPEX) provides performance optimizations for CPU training with both Float32 and BFloat16 (refer to the [single CPU section](./perf_train_cpu) to learn more).
 
 
 The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example.
@@ -132,3 +133,185 @@ Now, run the following command in node0 and **4DDP** will be enabled in node0 an
  --use_ipex \
  --bf16
 ```
+
+## Usage with Kubernetes
+
+The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the
+[Kubeflow PyTorchJob training operator](https://www.kubeflow.org/docs/components/training/pytorch/).
+
+### Setup
+
+This example assumes that you have:
+* Access to a Kubernetes cluster with [Kubeflow installed](https://www.kubeflow.org/docs/started/installing-kubeflow/)
+* [`kubectl`](https://kubernetes.io/docs/tasks/tools/) installed and configured to access the Kubernetes cluster
+* A [Persistent Volume Claim (PVC)](https://kubernetes.io/docs/concepts/storage/persistent-volumes/) that can be used
+  to store datasets and model files. There are multiple options for setting up the PVC including using an NFS
+  [storage class](https://kubernetes.io/docs/concepts/storage/storage-classes/) or a cloud storage bucket.
+* A Docker container that includes your model training script and all the dependencies needed to run the script. For
+  distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel
+  oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers.
+
+The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then
+extracts a Transformers release to the `/workspace` directory, so that the example scripts are included in the image:
+```
+FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9
+
+WORKDIR /workspace
+
+# Download and extract the transformers code
+ARG HF_TRANSFORMERS_VER="4.35.2"
+RUN mkdir transformers && \
+    curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf -
+```
+The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the
+PyTorchJob to the cluster.
+
+### PyTorchJob Specification File
+
+The [Kubeflow PyTorchJob](https://www.kubeflow.org/docs/components/training/pytorch/) is used to run the distributed
+training job on the cluster. The yaml file for the PyTorchJob defines parameters such as:
+ * The name of the PyTorchJob
+ * The number of replicas (workers)
+ * The python script and it's parameters that will be used to run the training job
+ * The types of resources (node selector, memory, and CPU) needed for each worker
+ * The image/tag for the Docker container to use
+ * Environment variables
+ * A volume mount for the PVC
+
+The volume mount defines a path where the PVC will be mounted in the container for each worker pod. This location can be
+used for the dataset, checkpoint files, and the saved model after training completes.
+
+The snippet below is an example of a yaml file for a PyTorchJob with 4 workers running the
+[question-answering example](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering).
+```yaml
+apiVersion: "kubeflow.org/v1"
+kind: PyTorchJob
+metadata:
+  name: transformers-pytorchjob
+  namespace: kubeflow
+spec:
+  elasticPolicy:
+    rdzvBackend: c10d
+    minReplicas: 1
+    maxReplicas: 4
+    maxRestarts: 10
+  pytorchReplicaSpecs:
+    Worker:
+      replicas: 4  # The number of worker pods
+      restartPolicy: OnFailure
+      template:
+        spec:
+          containers:
+            - name: pytorch
+              image: <image name>:<tag>  # Specify the docker image to use for the worker pods
+              imagePullPolicy: IfNotPresent
+              command:
+                - torchrun
+                - /workspace/transformers/examples/pytorch/question-answering/run_qa.py
+                - --model_name_or_path
+                - "bert-large-uncased"
+                - --dataset_name
+                - "squad"
+                - --do_train
+                - --do_eval
+                - --per_device_train_batch_size
+                - "12"
+                - --learning_rate
+                - "3e-5"
+                - --num_train_epochs
+                - "2"
+                - --max_seq_length
+                - "384"
+                - --doc_stride
+                - "128"
+                - --output_dir
+                - "/tmp/pvc-mount/output"
+                - --no_cuda
+                - --ddp_backend
+                - "ccl"
+                - --use_ipex
+                - --bf16  # Specify --bf16 if your hardware supports bfloat16
+              env:
+              - name: LD_PRELOAD
+                value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so"
+              - name: TRANSFORMERS_CACHE
+                value: "/tmp/pvc-mount/transformers_cache"
+              - name: HF_DATASETS_CACHE
+                value: "/tmp/pvc-mount/hf_datasets_cache"
+              - name: LOGLEVEL
+                value: "INFO"
+              - name: CCL_WORKER_COUNT
+                value: "1"
+              - name: OMP_NUM_THREADS  # Can be tuned for optimal performance
+-                value: "56"
+              resources:
+                limits:
+                  cpu: 200  # Update the CPU and memory limit values based on your nodes
+                  memory: 128Gi
+                requests:
+                  cpu: 200  # Update the CPU and memory request values based on your nodes
+                  memory: 128Gi
+              volumeMounts:
+              - name: pvc-volume
+                mountPath: /tmp/pvc-mount
+              - mountPath: /dev/shm
+                name: dshm
+          restartPolicy: Never
+          nodeSelector:  #  Optionally use the node selector to specify what types of nodes to use for the workers
+            node-type: spr
+          volumes:
+          - name: pvc-volume
+            persistentVolumeClaim:
+              claimName: transformers-pvc
+          - name: dshm
+            emptyDir:
+              medium: Memory
+```
+To run this example, update the yaml based on your training script and the nodes in your cluster.
+
+<Tip>
+
+The CPU resource limits/requests in the yaml are defined in [cpu units](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-cpu)
+where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical
+host or a VM). The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of
+available CPU/memory capacity on a single machine. It is usually a good idea to not use the entire machine's capacity in
+order to leave some resources for the kubelet and OS. In order to get ["guaranteed"](https://kubernetes.io/docs/concepts/workloads/pods/pod-qos/#guaranteed)
+[quality of service](https://kubernetes.io/docs/tasks/configure-pod-container/quality-service-pod/) for the worker pods,
+set the same CPU and memory amounts for both the resource limits and requests.
+
+</Tip>
+
+### Deploy
+
+After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed
+to the cluster using:
+```
+kubectl create -f pytorchjob.yaml
+```
+
+The `kubectl get pods -n kubeflow` command can then be used to list the pods in the `kubeflow` namespace. You should see
+the worker pods for the PyTorchJob that was just deployed. At first, they will probably have a status of "Pending" as
+the containers get pulled and created, then the status should change to "Running".
+```
+NAME                                                     READY   STATUS                  RESTARTS          AGE
+...
+transformers-pytorchjob-worker-0                         1/1     Running                 0                 7m37s
+transformers-pytorchjob-worker-1                         1/1     Running                 0                 7m37s
+transformers-pytorchjob-worker-2                         1/1     Running                 0                 7m37s
+transformers-pytorchjob-worker-3                         1/1     Running                 0                 7m37s
+...
+```
+
+The logs for worker can be viewed using `kubectl logs -n kubeflow <pod name>`. Add `-f` to stream the logs, for example:
+```
+kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f
+```
+
+After the training job completes, the trained model can be copied from the PVC or storage location. When you are done
+with the job, the PyTorchJob resource can be deleted from the cluster using `kubectl delete -f pytorchjob.yaml`.
+
+## Summary
+
+This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes
+cluster. Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training
+performance, and can be used as a template to run your own workload on multiple nodes.
diff --git a/docs/source/en/perf_train_gpu_many.md b/docs/source/en/perf_train_gpu_many.md
index 1795782949d1aa..4d89cf341a555d 100644
--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@@ -153,7 +153,7 @@ python examples/pytorch/language-modeling/run_clm.py \
 
 ```
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
-python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -164,7 +164,7 @@ python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-
 
 ```
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
-python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
diff --git a/docs/source/en/perf_train_gpu_one.md b/docs/source/en/perf_train_gpu_one.md
index 25117241f78fbc..089c9905cabada 100644
--- a/docs/source/en/perf_train_gpu_one.md
+++ b/docs/source/en/perf_train_gpu_one.md
@@ -237,7 +237,7 @@ You can speedup the training throughput by using Flash Attention 2 integration i
 The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). Adam achieves 
 good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory 
 footprint of the order of the number of model parameters. To remedy this, you can use an alternative optimizer. 
-For example if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed, `adamw_apex_fused` will give you the 
+For example if you have [NVIDIA/apex](https://github.com/NVIDIA/apex) installed for NVIDIA GPUs, or [ROCmSoftwarePlatform/apex](https://github.com/ROCmSoftwarePlatform/apex) for AMD GPUs, `adamw_apex_fused` will give you the
 fastest training experience among all supported AdamW optimizers.
 
 [`Trainer`] integrates a variety of optimizers that can be used out of box: `adamw_hf`, `adamw_torch`, `adamw_torch_fused`, 
@@ -529,4 +529,4 @@ By default, in training mode, the BetterTransformer integration **drops the mask
 
 </Tip>
 
-Check out this [blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about acceleration and memory-savings with SDPA.
\ No newline at end of file
+Check out this [blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about acceleration and memory-savings with SDPA.
diff --git a/docs/source/en/preprocessing.md b/docs/source/en/preprocessing.md
index f08808433c269e..743904cc994c09 100644
--- a/docs/source/en/preprocessing.md
+++ b/docs/source/en/preprocessing.md
@@ -220,7 +220,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 For audio tasks, you'll need a [feature extractor](main_classes/feature_extractor) to prepare your dataset for the model. The feature extractor is designed to extract features from raw audio data, and convert them into tensors.
 
-Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets:
+Load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets:
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -340,7 +340,7 @@ You can use any library you like for image augmentation. For image preprocessing
 
 </Tip>
 
-Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets:
+Load the [food101](https://huggingface.co/datasets/food101) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets:
 
 <Tip>
 
@@ -354,7 +354,7 @@ Use 🤗 Datasets `split` parameter to only load a small sample from the trainin
 >>> dataset = load_dataset("food101", split="train[:100]")
 ```
 
-Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) feature:
+Next, take a look at the image with 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image) feature:
 
 ```py
 >>> dataset[0]["image"]
@@ -467,7 +467,7 @@ from [`DetrImageProcessor`] and define a custom `collate_fn` to batch images tog
 
 For tasks involving multimodal inputs, you'll need a [processor](main_classes/processors) to prepare your dataset for the model. A processor couples together two processing objects such as as tokenizer and feature extractor.
 
-Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR):
+Load the [LJ Speech](https://huggingface.co/datasets/lj_speech) dataset (see the 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR):
 
 ```py
 >>> from datasets import load_dataset
diff --git a/docs/source/en/quantization.md b/docs/source/en/quantization.md
new file mode 100644
index 00000000000000..0a471fd423c6f9
--- /dev/null
+++ b/docs/source/en/quantization.md
@@ -0,0 +1,583 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Quantization
+
+Quantization techniques focus on representing data with less information while also trying to not lose too much accuracy. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size which makes it easier to store and reduces memory-usage. Lower precision can also speedup inference because it takes less time to perform calculations with fewer bits.
+
+Transformers supports several quantization schemes to help you run inference with large language models (LLMs) and finetune adapters on quantized models. This guide will show you how to use Activation-aware Weight Quantization (AWQ), AutoGPTQ, and bitsandbytes.
+
+## AWQ
+
+<Tip>
+
+Try AWQ quantization with this [notebook](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)!
+
+</Tip>
+
+[Activation-aware Weight Quantization (AWQ)](https://hf.co/papers/2306.00978) doesn't quantize all the weights in a model, and instead, it preserves a small percentage of weights that are important for LLM performance. This significantly reduces quantization loss such that you can run models in 4-bit precision without experiencing any performance degradation.
+
+There are several libraries for quantizing models with the AWQ algorithm, such as [llm-awq](https://github.com/mit-han-lab/llm-awq), [autoawq](https://github.com/casper-hansen/AutoAWQ) or [optimum-intel](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc). Transformers supports loading models quantized with the llm-awq and autoawq libraries. This guide will show you how to load models quantized with autoawq, but the processs is similar for llm-awq quantized models.
+
+Make sure you have autoawq installed:
+
+```bash
+pip install autoawq
+```
+
+AWQ-quantized models can be identified by checking the `quantization_config` attribute in the model's [config.json](https://huggingface.co/TheBloke/zephyr-7B-alpha-AWQ/blob/main/config.json) file:
+
+```json
+{
+  "_name_or_path": "/workspace/process/huggingfaceh4_zephyr-7b-alpha/source",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  ...
+  ...
+  ...
+  "quantization_config": {
+    "quant_method": "awq",
+    "zero_point": true,
+    "group_size": 128,
+    "bits": 4,
+    "version": "gemm"
+  }
+}
+```
+
+A quantized model is loaded with the [`~PreTrainedModel.from_pretrained`] method. If you loaded your model on the CPU, make sure to move it to a GPU device first. Use the `device_map` parameter to specify where to place the model:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")
+```
+
+Loading an AWQ-quantized model automatically sets other weights to fp16 by default for performance reasons. If you want to load these other weights in a different format, use the `torch_dtype` parameter:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32)
+```
+
+AWQ quantization can also be combined with [FlashAttention-2](perf_infer_gpu_one#flashattention-2) to further accelerate inference:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map="cuda:0")
+```
+
+### Fused modules
+
+Fused modules offers improved accuracy and performance and it is supported out-of-the-box for AWQ modules for [Llama](https://huggingface.co/meta-llama) and [Mistral](https://huggingface.co/mistralai/Mistral-7B-v0.1) architectures, but you can also fuse AWQ modules for unsupported architectures.
+
+<Tip warning={true}>
+
+Fused modules cannot be combined with other optimization techniques such as FlashAttention-2.
+
+</Tip>
+
+<hfoptions id="fuse">
+<hfoption id="supported architectures">
+
+To enable fused modules for supported architectures, create an [`AwqConfig`] and set the parameters `fuse_max_seq_len` and `do_fuse=True`. The `fuse_max_seq_len` parameter is the total sequence length and it should include the context length and the expected generation length. You can set it to a larger value to be safe.
+
+For example, to fuse the AWQ modules of the [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model.
+
+```python
+import torch
+from transformers import AwqConfig, AutoModelForCausalLM
+
+model_id = "TheBloke/Mistral-7B-OpenOrca-AWQ"
+
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,
+    do_fuse=True,
+)
+
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
+```
+
+</hfoption>
+<hfoption id="unsupported architectures">
+
+For architectures that don't support fused modules yet, you need to create a custom fusing mapping to define which modules need to be fused with the `modules_to_fuse` parameter. For example, to fuse the AWQ modules of the [TheBloke/Yi-34B-AWQ](https://huggingface.co/TheBloke/Yi-34B-AWQ) model.
+
+```python
+import torch
+from transformers import AwqConfig, AutoModelForCausalLM
+
+model_id = "TheBloke/Yi-34B-AWQ"
+
+quantization_config = AwqConfig(
+    bits=4,
+    fuse_max_seq_len=512,
+    modules_to_fuse={
+        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
+        "layernorm": ["ln1", "ln2", "norm"],
+        "mlp": ["gate_proj", "up_proj", "down_proj"],
+        "use_alibi": False,
+        "num_attention_heads": 56,
+        "num_key_value_heads": 8,
+        "hidden_size": 7168
+    }
+)
+
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config).to(0)
+```
+
+The parameter `modules_to_fuse` should include:
+
+- `"attention"`: The names of the attention layers to fuse in the following order: query, key, value and output projection layer. If you don't want to fuse these layers, pass an empty list.
+- `"layernorm"`: The names of all the LayerNorm layers you want to replace with a custom fused LayerNorm. If you don't want to fuse these layers, pass an empty list.
+- `"mlp"`: The names of the MLP layers you want to fuse into a single MLP layer in the order: (gate (dense, layer, post-attention) / up / down layers).
+- `"use_alibi"`: If your model uses ALiBi positional embedding.
+- `"num_attention_heads"`: The number of attention heads.
+- `"num_key_value_heads"`: The number of key value heads that should be used to implement Grouped Query Attention (GQA). If `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used.
+- `"hidden_size"`: The dimension of the hidden representations.
+
+</hfoption>
+</hfoptions>
+
+## AutoGPTQ
+
+<Tip>
+
+Try GPTQ quantization with PEFT in this [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) and learn more about it's details in this [blog post](https://huggingface.co/blog/gptq-integration)!
+
+</Tip>
+
+The [AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) library implements the GPTQ algorithm, a post-training quantization technique where each row of the weight matrix is quantized independently to find a version of the weights that minimizes the error. These weights are quantized to int4, but they're restored to fp16 on the fly during inference. This can save your memory-usage by 4x because the int4 weights are dequantized in a fused kernel rather than a GPU's global memory, and you can also expect a speedup in inference because using a lower bitwidth takes less time to communicate.
+
+Before you begin, make sure the following libraries are installed:
+
+```bash
+pip install auto-gptq
+pip install git+https://github.com/huggingface/optimum.git
+pip install git+https://github.com/huggingface/transformers.git
+pip install --upgrade accelerate
+```
+
+To quantize a model (currently only supported for text models), you need to create a [`GPTQConfig`] class and set the number of bits to quantize to, a dataset to calibrate the weights for quantization, and a tokenizer to prepare the dataset.
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig
+
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+gptq_config = GPTQConfig(bits=4, dataset="c4", tokenizer=tokenizer)
+```
+
+You could also pass your own dataset as a list of strings, but it is highly recommended to use the same dataset from the GPTQ paper.
+
+```py
+dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer)
+```
+
+Load a model to quantize and pass the `gptq_config` to the [`~AutoModelForCausalLM.from_pretrained`] method. Set `device_map="auto"` to automatically offload the model to a CPU to help fit the model in memory, and allow the model modules to be moved between the CPU and GPU for quantization.
+
+```py
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
+```
+
+If you're running out of memory because a dataset is too large, disk offloading is not supported. If this is the case, try passing the `max_memory` parameter to allocate the amount of memory to use on your device (GPU and CPU):
+
+```py
+quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "30GiB", 1: "46GiB", "cpu": "30GiB"}, quantization_config=gptq_config)
+```
+
+<Tip warning={true}>
+
+Depending on your hardware, it can take some time to quantize a model from scratch. It can take ~5 minutes to quantize the [faceboook/opt-350m]() model on a free-tier Google Colab GPU, but it'll take ~4 hours to quantize a 175B parameter model on a NVIDIA A100. Before you quantize a model, it is a good idea to check the Hub if a GPTQ-quantized version of the model already exists.
+
+</Tip>
+
+Once your model is quantized, you can push the model and tokenizer to the Hub where it can be easily shared and accessed. Use the [`~PreTrainedModel.push_to_hub`] method to save the [`GPTQConfig`]:
+
+```py
+quantized_model.push_to_hub("opt-125m-gptq")
+tokenizer.push_to_hub("opt-125m-gptq")
+```
+
+You could also save your quantized model locally with the [`~PreTrainedModel.save_pretrained`] method. If the model was quantized with the `device_map` parameter, make sure to move the entire model to a GPU or CPU before saving it. For example, to save the model on a CPU:
+
+```py
+quantized_model.save_pretrained("opt-125m-gptq")
+tokenizer.save_pretrained("opt-125m-gptq")
+
+# if quantized with device_map set
+quantized_model.to("cpu")
+quantized_model.save_pretrained("opt-125m-gptq")
+```
+
+Reload a quantized model with the [`~PreTrainedModel.from_pretrained`] method, and set `device_map="auto"` to automatically distribute the model on all available GPUs to load the model faster without using more memory than needed.
+
+```py
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
+```
+
+### ExLlama
+
+[ExLlama](https://github.com/turboderp/exllama) is a Python/C++/CUDA implementation of the [Llama](model_doc/llama) model that is designed for faster inference with 4-bit GPTQ weights (check out these [benchmarks](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)). The ExLlama kernel is activated by default when you create a [`GPTQConfig`] object. To boost inference speed even further, use the [ExLlamaV2](https://github.com/turboderp/exllamav2) kernels by configuring the `exllama_config` parameter:
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, GPTQConfig
+
+gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
+```
+
+<Tip warning={true}>
+
+Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're finetuning a quantized model with PEFT.
+
+</Tip>
+
+The ExLlama kernels are only supported when the entire model is on the GPU. If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, GPTQConfig
+gptq_config = GPTQConfig(bits=4, use_exllama=False)
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config)
+```
+
+## bitsandbytes
+
+[bitsandbytes](https://github.com/TimDettmers/bitsandbytes) is the easiest option for quantizing a model to 8 and 4-bit. 8-bit quantization multiplies outliers in fp16 with non-outliers in int8, converts the non-outlier values back to fp16, and then adds them together to return the weights in fp16. This reduces the degradative effect outlier values have on a model's performance. 4-bit quantization compresses a model even further, and it is commonly used with [QLoRA](https://hf.co/papers/2305.14314) to finetune quantized LLMs.
+
+To use bitsandbytes, make sure you have the following libraries installed:
+
+<hfoptions id="bnb">
+<hfoption id="8-bit">
+
+```bash
+pip install transformers accelerate bitsandbytes>0.37.0
+```
+
+</hfoption>
+<hfoption id="4-bit">
+
+```bash
+pip install bitsandbytes>=0.39.0
+pip install --upgrade accelerate
+pip install --upgrade transformers
+```
+
+</hfoption>
+</hfoptions>
+
+Now you can quantize a model with the `load_in_8bit` or `load_in_4bit` parameters in the [`~PreTrainedModel.from_pretrained`] method. This works for any model in any modality, as long as it supports loading with Accelerate and contains `torch.nn.Linear` layers.
+
+<hfoptions id="bnb">
+<hfoption id="8-bit">
+
+Quantizing a model in 8-bit halves the memory-usage, and for large models, set `device_map="auto"` to efficiently use the GPUs available:
+
+```py
+from transformers import AutoModelForCausalLM
+
+model_8bit = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7", device_map="auto", load_in_8bit=True)
+```
+
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+
+```py
+import torch
+from transformers import AutoModelForCausalLM
+
+model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True, torch_dtype=torch.float32)
+model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
+```
+
+Once a model is quantized to 8-bit, you can't push the quantized weights to the Hub unless you're using the latest version of Transformers and bitsandbytes. If you have the latest versions, then you can push the 8-bit model to the Hub with the [`~PreTrainedModel.push_to_hub`] method. The quantization config.json file is pushed first, followed by the quantized model weights.
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+
+model.push_to_hub("bloom-560m-8bit")
+```
+
+</hfoption>
+<hfoption id="4-bit">
+
+Quantizing a model in 4-bit reduces your memory-usage by 4x, and for large models, set `device_map="auto"` to efficiently use the GPUs available:
+
+```py
+from transformers import AutoModelForCausalLM
+
+model_4bit = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7", device_map="auto", load_in_4bit=True)
+```
+
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
+
+```py
+import torch
+from transformers import AutoModelForCausalLM
+
+model_4bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_4bit=True, torch_dtype=torch.float32)
+model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
+```
+
+Once a model is quantized to 4-bit, you can't push the quantized weights to the Hub.
+
+</hfoption>
+</hfoptions>
+
+<Tip warning={true}>
+
+Training with 8-bit and 4-bit weights are only supported for training *extra* parameters.
+
+</Tip>
+
+You can check your memory footprint with the `get_memory_footprint` method:
+
+```py
+print(model.get_memory_footprint())
+```
+
+Quantized models can be loaded from the [`~PreTrainedModel.from_pretrained`] method without needing to specify the `load_in_8bit` or `load_in_4bit` parameters:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto")
+```
+
+### 8-bit
+
+<Tip>
+
+Learn more about the details of 8-bit quantization in this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration)!
+
+</Tip>
+
+This section explores some of the specific features of 8-bit models, such as offloading, outlier thresholds, skipping module conversion, and finetuning.
+
+#### Offloading
+
+8-bit models can offload weights between the CPU and GPU to support fitting very large models into memory. The weights dispatched to the CPU are actually stored in **float32**, and aren't converted to 8-bit. For example, to enable offloading for the [bigscience/bloom-1b7](https://huggingface.co/bigscience/bloom-1b7) model, start by creating a [`BitsAndBytesConfig`]:
+
+```py
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
+```
+
+Design a custom device map to fit everything on your GPU except for the `lm_head`, which you'll dispatch to the CPU:
+
+```py
+device_map = {
+    "transformer.word_embeddings": 0,
+    "transformer.word_embeddings_layernorm": 0,
+    "lm_head": "cpu",
+    "transformer.h": 0,
+    "transformer.ln_f": 0,
+}
+```
+
+Now load your model with the custom `device_map` and `quantization_config`:
+
+```py
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    "bigscience/bloom-1b7",
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+```
+
+#### Outlier threshold
+
+An "outlier" is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning).
+
+To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]:
+
+```py
+from transformers import AutoModelForCausalLM, BitsAndBytesConfig
+
+model_id = "bigscience/bloom-1b7"
+
+quantization_config = BitsAndBytesConfig(
+    llm_int8_threshold=10,
+)
+
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+```
+
+#### Skip module conversion
+
+For some models, like [Jukebox](model_doc/jukebox), you don't need to quantize every module to 8-bit which can actually cause instability. With Jukebox, there are several `lm_head` modules that should be skipped using the `llm_int8_skip_modules` parameter in [`BitsAndBytesConfig`]:
+
+```py
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "bigscience/bloom-1b7"
+
+quantization_config = BitsAndBytesConfig(
+    llm_int8_skip_modules=["lm_head"],
+)
+
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    quantization_config=quantization_config,
+)
+```
+
+#### Finetuning
+
+With the [PEFT](https://github.com/huggingface/peft) library, you can finetune large models like [flan-t5-large](https://huggingface.co/google/flan-t5-large) and [facebook/opt-6.7b](https://huggingface.co/facebook/opt-6.7b) with 8-bit quantization. You don't need to pass the `device_map` parameter for training because it'll automatically load your model on a GPU. However, you can still customize the device map with the `device_map` parameter if you want to (`device_map="auto"` should only be used for inference).
+
+### 4-bit
+
+<Tip>
+
+Try 4-bit quantization in this [notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf) and learn more about it's details in this [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
+
+</Tip>
+
+This section explores some of the specific features of 4-bit models, such as changing the compute data type, using the Normal Float 4 (NF4) data type, and using nested quantization.
+
+#### Compute data type
+
+To speedup computation, you can change the data type from float32 (the default value) to bf16 using the `bnb_4bit_compute_dtype` parameter in [`BitsAndBytesConfig`]:
+
+```py
+import torch
+from transformers import BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+```
+
+#### Normal Float 4 (NF4)
+
+NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models. This can be configured with the `bnb_4bit_quant_type` parameter in the [`BitsAndBytesConfig`]:
+
+```py
+from transformers import BitsAndBytesConfig
+
+nf4_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+)
+
+model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
+```
+
+For inference, the `bnb_4bit_quant_type` does not have a huge impact on performance. However, to remain consistent with the model weights, you should use the `bnb_4bit_compute_dtype` and `torch_dtype` values.
+
+#### Nested quantization
+
+Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an addition 0.4 bits/parameter. For example, with nested quantization, you can finetune a [Llama-13b](https://huggingface.co/meta-llama/Llama-2-13b) model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and enabling gradient accumulation with 4 steps.
+
+```py
+from transformers import BitsAndBytesConfig
+
+double_quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+)
+
+model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", quantization_config=double_quant_config)
+```
+
+## Optimum
+
+The [Optimum](https://huggingface.co/docs/optimum/index) library supports quantization for Intel, Furiosa, ONNX Runtime, GPTQ, and lower-level PyTorch quantization functions. Consider using Optimum for quantization if you're using specific and optimized hardware like Intel CPUs, Furiosa NPUs or a model accelerator like ONNX Runtime.
+
+## Benchmarks
+
+To compare the speed, throughput, and latency of each quantization scheme, check the following benchmarks obtained from the [optimum-benchmark](https://github.com/huggingface/optimum-benchmark) library. The benchmark was run on a NVIDIA A1000 for the [TheBloke/Mistral-7B-v0.1-AWQ](https://huggingface.co/TheBloke/Mistral-7B-v0.1-AWQ) and [TheBloke/Mistral-7B-v0.1-GPTQ](https://huggingface.co/TheBloke/Mistral-7B-v0.1-GPTQ) models. These were also tested against the bitsandbytes quantization methods as well as a native fp16 model.
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_memory_plot.png" alt="forward peak memory per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">forward peak memory/batch size</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_memory_plot.png" alt="generate peak memory per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generate peak memory/batch size</figcaption>
+  </div>
+</div>
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_throughput_plot.png" alt="generate throughput per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generate throughput/batch size</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_latency_plot.png" alt="forward latency per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">forward latency/batch size</figcaption>
+  </div>
+</div>
+
+The benchmarks indicate AWQ quantization is the fastest for inference, text generation, and has the lowest peak memory for text generation. However, AWQ has the largest forward latency per batch size. For a more detailed discussion about the pros and cons of each quantization method, read the [Overview of natively supported quantization schemes in 🤗 Transformers](https://huggingface.co/blog/overview-quantization-transformers) blog post.
+
+### Fused AWQ modules
+
+The [TheBloke/Mistral-7B-OpenOrca-AWQ](https://huggingface.co/TheBloke/Mistral-7B-OpenOrca-AWQ) model was benchmarked with `batch_size=1` with and without fused modules.
+
+<figcaption class="text-center text-gray-500 text-lg">Unfused module</figcaption>
+|   Batch Size |   Prefill Length |   Decode Length |   Prefill tokens/s |   Decode tokens/s | Memory (VRAM)   |
+|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
+|            1 |               32 |              32 |            60.0984 |           38.4537 | 4.50 GB (5.68%) |
+|            1 |               64 |              64 |          1333.67   |           31.6604 | 4.50 GB (5.68%) |
+|            1 |              128 |             128 |          2434.06   |           31.6272 | 4.50 GB (5.68%) |
+|            1 |              256 |             256 |          3072.26   |           38.1731 | 4.50 GB (5.68%) |
+|            1 |              512 |             512 |          3184.74   |           31.6819 | 4.59 GB (5.80%) |
+|            1 |             1024 |            1024 |          3148.18   |           36.8031 | 4.81 GB (6.07%) |
+|            1 |             2048 |            2048 |          2927.33   |           35.2676 | 5.73 GB (7.23%) |
+
+<figcaption class="text-center text-gray-500 text-lg">Fused module</figcaption>
+|   Batch Size |   Prefill Length |   Decode Length |   Prefill tokens/s |   Decode tokens/s | Memory (VRAM)   |
+|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------|
+|            1 |               32 |              32 |            81.4899 |           80.2569 | 4.00 GB (5.05%) |
+|            1 |               64 |              64 |          1756.1    |          106.26   | 4.00 GB (5.05%) |
+|            1 |              128 |             128 |          2479.32   |          105.631  | 4.00 GB (5.06%) |
+|            1 |              256 |             256 |          1813.6    |           85.7485 | 4.01 GB (5.06%) |
+|            1 |              512 |             512 |          2848.9    |           97.701  | 4.11 GB (5.19%) |
+|            1 |             1024 |            1024 |          3044.35   |           87.7323 | 4.41 GB (5.57%) |
+|            1 |             2048 |            2048 |          2715.11   |           89.4709 | 5.57 GB (7.04%) |
+
+The speed and throughput of fused and unfused modules were also tested with the [optimum-benchmark](https://github.com/huggingface/optimum-benchmark) library.
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_forward_memory_plot.png" alt="generate throughput per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">foward peak memory/batch size</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/fused_generate_throughput_plot.png" alt="forward latency per batch size" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generate throughput/batch size</figcaption>
+  </div>
+</div>
diff --git a/docs/source/en/run_scripts.md b/docs/source/en/run_scripts.md
index 3b40b6ea067271..0652bb1da5e4a7 100644
--- a/docs/source/en/run_scripts.md
+++ b/docs/source/en/run_scripts.md
@@ -130,7 +130,7 @@ The [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) sup
 - Set the number of GPUs to use with the `nproc_per_node` argument.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/en/tasks/idefics.md b/docs/source/en/tasks/idefics.md
index 376ec8b308b0d9..da95d74edcec74 100644
--- a/docs/source/en/tasks/idefics.md
+++ b/docs/source/en/tasks/idefics.md
@@ -109,7 +109,6 @@ on the fly while loading.
 Now that you have the model loaded in one of the suggested ways, let's move on to exploring tasks that you can use IDEFICS for.
 
 ## Image captioning
-
 Image captioning is the task of predicting a caption for a given image. A common application is to aid visually impaired 
 people navigate through different situations, for instance, explore image content online. 
 
@@ -229,7 +228,7 @@ Let's get a new image for this task:
      <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg" alt="Image of a couple having a picnic"/>
 </div>
 
-Photo by [Jarritos Mexican Soda](https://unsplash.com/@jarritos).
+Photo by [Jarritos Mexican Soda](https://unsplash.com/@jarritos). 
 
 You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions: 
 
diff --git a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
index d06b64fbc5a87d..8448e53011494c 100644
--- a/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
+++ b/docs/source/en/tasks/knowledge_distillation_for_image_classification.md
@@ -61,8 +61,8 @@ import torch.nn.functional as F
 
 
 class ImageDistilTrainer(Trainer):
-    def __init__(self, *args, teacher_model=None, **kwargs):
-        super().__init__(*args, **kwargs)
+    def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None,  *args, **kwargs):
+        super().__init__(model=student_model, *args, **kwargs)
         self.teacher = teacher_model
         self.student = student_model
         self.loss_function = nn.KLDivLoss(reduction="batchmean")
@@ -164,7 +164,7 @@ trainer = ImageDistilTrainer(
     train_dataset=processed_datasets["train"],
     eval_dataset=processed_datasets["validation"],
     data_collator=data_collator,
-    tokenizer=teacher_extractor,
+    tokenizer=teacher_processor,
     compute_metrics=compute_metrics,
     temperature=5,
     lambda_param=0.5
diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md
index 0b9c248702199b..a50555dfcf941a 100644
--- a/docs/source/en/tasks/language_modeling.md
+++ b/docs/source/en/tasks/language_modeling.md
@@ -37,7 +37,7 @@ You can finetune other architectures for causal language modeling following the
 Choose one of the following architectures:
 
 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
-[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
+[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
 
 
 
@@ -110,7 +110,7 @@ The next step is to load a DistilGPT2 tokenizer to process the `text` subfield:
 ```
 
 You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to
-extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method:
+extract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process#flatten) method:
 
 ```py
 >>> eli5 = eli5.flatten()
diff --git a/docs/source/en/tasks/masked_language_modeling.md b/docs/source/en/tasks/masked_language_modeling.md
index ba1e9e50dbe80c..e716447b83bbe8 100644
--- a/docs/source/en/tasks/masked_language_modeling.md
+++ b/docs/source/en/tasks/masked_language_modeling.md
@@ -105,7 +105,7 @@ For masked language modeling, the next step is to load a DistilRoBERTa tokenizer
 ```
 
 You'll notice from the example above, the `text` field is actually nested inside `answers`. This means you'll need to e
-xtract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) method:
+xtract the `text` subfield from its nested structure with the [`flatten`](https://huggingface.co/docs/datasets/process#flatten) method:
 
 ```py
 >>> eli5 = eli5.flatten()
diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md
index 7511ee66dd0b99..6f53655e7cbc76 100644
--- a/docs/source/en/tasks/object_detection.md
+++ b/docs/source/en/tasks/object_detection.md
@@ -512,7 +512,7 @@ Finally, load the metrics and run the evaluation.
 ...         outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
 
 ...         orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
-...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to COCO api
+...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
 
 ...         module.add(prediction=results, reference=labels)
 ...         del batch
diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md
index 2895a1977721d4..f422fef9aeb566 100644
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@@ -14,29 +14,17 @@ rendered properly in your Markdown viewer.
 
 -->
 
-# Semantic segmentation
+# Image Segmentation
 
 [[open-in-colab]]
 
 <Youtube id="dKE8SIt9C-w"/>
 
-Semantic segmentation assigns a label or class to each individual pixel of an image. There are several types of segmentation, and in the case of semantic segmentation, no distinction is made between unique instances of the same object. Both objects are given the same label (for example, "car" instead of "car-1" and "car-2"). Common real-world applications of semantic segmentation include training self-driving cars to identify pedestrians and important traffic information, identifying cells and abnormalities in medical imagery, and monitoring environmental changes from satellite imagery.
+Image segmentation models separate areas corresponding to different areas of interest in an image. These models work by assigning a label to each pixel. There are several types of segmentation: semantic segmentation, instance segmentation, and panoptic segmentation.
 
-This guide will show you how to:
-
-1. Finetune [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) on the [SceneParse150](https://huggingface.co/datasets/scene_parse_150) dataset.
-2. Use your finetuned model for inference.
-
-<Tip>
-The task illustrated in this tutorial is supported by the following model architectures:
-
-<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
-
-[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
-
-<!--End of the generated tip-->
-
-</Tip>
+In this guide, we will:
+1. [Take a look at different types of segmentation](#types-of-segmentation).
+2. [Have an end-to-end fine-tuning example for semantic segmentation](#fine-tuning-a-model-for-segmentation).
 
 Before you begin, make sure you have all the necessary libraries installed:
 
@@ -52,7 +40,178 @@ We encourage you to log in to your Hugging Face account so you can upload and sh
 >>> notebook_login()
 ```
 
-## Load SceneParse150 dataset
+## Types of Segmentation
+
+Semantic segmentation assigns a label or class to every single pixel in an image. Let's take a look at a semantic segmentation model output. It will assign the same class to every instance of an object it comes across in an image, for example, all cats will be labeled as "cat" instead of "cat-1", "cat-2".
+We can use transformers' image segmentation pipeline to quickly infer a semantic segmentation model. Let's take a look at the example image.
+
+```python
+from transformers import pipeline
+from PIL import Image
+import requests
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/segmentation_input.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/segmentation_input.jpg" alt="Segmentation Input"/>
+</div>
+
+We will use [nvidia/segformer-b1-finetuned-cityscapes-1024-1024](https://huggingface.co/nvidia/segformer-b1-finetuned-cityscapes-1024-1024). 
+
+```python
+semantic_segmentation = pipeline("image-segmentation", "nvidia/segformer-b1-finetuned-cityscapes-1024-1024")
+results = semantic_segmentation(image)
+results
+```
+
+The segmentation pipeline output includes a mask for every predicted class. 
+```bash
+[{'score': None,
+  'label': 'road',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'sidewalk',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'building',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'wall',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'pole',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'traffic sign',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'vegetation',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'terrain',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'sky',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': None,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>}]
+```
+
+Taking a look at the mask for the car class, we can see every car is classified with the same mask.
+
+```python
+results[-1]["mask"]
+```
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/semantic_segmentation_output.png" alt="Semantic Segmentation Output"/>
+</div>
+
+In instance segmentation, the goal is not to classify every pixel, but to predict a mask for **every instance of an object** in a given image. It works very similar to object detection, where there is a bounding box for every instance, there's a segmentation mask instead. We will use [facebook/mask2former-swin-large-cityscapes-instance](https://huggingface.co/facebook/mask2former-swin-large-cityscapes-instance) for this. 
+
+```python
+instance_segmentation = pipeline("image-segmentation", "facebook/mask2former-swin-large-cityscapes-instance")
+results = instance_segmentation(Image.open(image))
+results
+```
+
+As you can see below, there are multiple cars classified, and there's no classification for pixels other than pixels that belong to car and person instances.
+
+```bash
+[{'score': 0.999944,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999945,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999652,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.903529,
+  'label': 'person',
+  'mask': <PIL.Image.Image image mode=L size=612x415>}]
+```
+Checking out one of the car masks below.
+
+```python
+results[2]["mask"]
+```
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/instance_segmentation_output.png" alt="Semantic Segmentation Output"/>
+</div>
+
+Panoptic segmentation combines semantic segmentation and instance segmentation, where every pixel is classified into a class and an instance of that class, and there are multiple masks for each instance of a class. We can use [facebook/mask2former-swin-large-cityscapes-panoptic](https://huggingface.co/facebook/mask2former-swin-large-cityscapes-panoptic) for this.
+
+```python
+panoptic_segmentation = pipeline("image-segmentation", "facebook/mask2former-swin-large-cityscapes-panoptic")
+results = panoptic_segmentation(Image.open(image))
+results
+```
+As you can see below, we have more classes. We will later illustrate to see that every pixel is classified into one of the classes.
+
+```bash
+[{'score': 0.999981,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999958,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.99997,
+  'label': 'vegetation',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999575,
+  'label': 'pole',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999958,
+  'label': 'building',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999634,
+  'label': 'road',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.996092,
+  'label': 'sidewalk',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.999221,
+  'label': 'car',
+  'mask': <PIL.Image.Image image mode=L size=612x415>},
+ {'score': 0.99987,
+  'label': 'sky',
+  'mask': <PIL.Image.Image image mode=L size=612x415>}]
+```
+
+Let's have a side by side comparison for all types of segmentation.
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/segmentation-comparison.png" alt="Segmentation Maps Compared"/>
+</div>
+
+Seeing all types of segmentation, let's have a deep dive on fine-tuning a model for semantic segmentation.
+
+Common real-world applications of semantic segmentation include training self-driving cars to identify pedestrians and important traffic information, identifying cells and abnormalities in medical imagery, and monitoring environmental changes from satellite imagery.
+
+## Fine-tuning a Model for Segmentation
+
+We will now:
+
+1. Finetune [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) on the [SceneParse150](https://huggingface.co/datasets/scene_parse_150) dataset.
+2. Use your fine-tuned model for inference.
+
+<Tip>
+The task illustrated in this tutorial is supported by the following model architectures:
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+
+### Load SceneParse150 dataset
 
 Start by loading a smaller subset of the SceneParse150 dataset from the 🤗 Datasets library. This'll give you a chance to experiment and make sure everything works before spending more time training on the full dataset.
 
@@ -97,7 +256,60 @@ You'll also want to create a dictionary that maps a label id to a label class wh
 >>> num_labels = len(id2label)
 ```
 
-## Preprocess
+#### Custom dataset
+
+You could also create and use your own dataset if you prefer to train with the [run_semantic_segmentation.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py) script instead of a notebook instance. The script requires:
+
+1. a [`~datasets.DatasetDict`] with two [`~datasets.Image`] columns, "image" and "label"
+
+     ```py
+     from datasets import Dataset, DatasetDict, Image
+
+     image_paths_train = ["path/to/image_1.jpg/jpg", "path/to/image_2.jpg/jpg", ..., "path/to/image_n.jpg/jpg"]
+     label_paths_train = ["path/to/annotation_1.png", "path/to/annotation_2.png", ..., "path/to/annotation_n.png"]
+
+     image_paths_validation = [...]
+     label_paths_validation = [...]
+
+     def create_dataset(image_paths, label_paths):
+         dataset = Dataset.from_dict({"image": sorted(image_paths),
+                                     "label": sorted(label_paths)})
+         dataset = dataset.cast_column("image", Image())
+         dataset = dataset.cast_column("label", Image())
+
+     return dataset
+
+     # step 1: create Dataset objects
+     train_dataset = create_dataset(image_paths_train, label_paths_train)
+     validation_dataset = create_dataset(image_paths_validation, label_paths_validation)
+
+     # step 2: create DatasetDict
+     dataset = DatasetDict({
+          "train": train_dataset,
+          "validation": validation_dataset,
+          }
+     )
+
+     # step 3: push to Hub (assumes you have ran the huggingface-cli login command in a terminal/notebook)
+     dataset.push_to_hub("your-name/dataset-repo")
+
+     # optionally, you can push to a private repo on the Hub
+     # dataset.push_to_hub("name of repo on the hub", private=True)
+     ```
+
+2. an id2label dictionary mapping the class integers to their class names
+
+     ```py
+     import json
+     # simple example
+     id2label = {0: 'cat', 1: 'dog'}
+     with open('id2label.json', 'w') as fp:
+     json.dump(id2label, fp)
+     ```
+
+As an example, take a look at this [example dataset](https://huggingface.co/datasets/nielsr/ade20k-demo) which was created with the steps shown above.
+
+### Preprocess
 
 The next step is to load a SegFormer image processor to prepare the images and annotations for the model. Some datasets, like this one, use the zero-index as the background class. However, the background class isn't actually included in the 150 classes, so you'll need to set `reduce_labels=True` to subtract one from all the labels. The zero-index is replaced by `255` so it's ignored by SegFormer's loss function:
 
@@ -204,7 +416,7 @@ The transform is applied on the fly which is faster and consumes less disk space
 </tf>
 </frameworkcontent>
 
-## Evaluate
+### Evaluate
 
 Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) metric (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):
 
@@ -289,7 +501,7 @@ logits first, and then reshaped to match the size of the labels before you can c
 
 Your `compute_metrics` function is ready to go now, and you'll return to it when you setup your training.
 
-## Train
+### Train
 <frameworkcontent>
 <pt>
 <Tip>
@@ -453,7 +665,7 @@ Congratulations! You have fine-tuned your model and shared it on the 🤗 Hub. Y
 </frameworkcontent>
 
 
-## Inference
+### Inference
 
 Great, now that you've finetuned a model, you can use it for inference!
 
@@ -470,43 +682,8 @@ Load an image for inference:
 
 <frameworkcontent>
 <pt>
-The simplest way to try out your finetuned model for inference is to use it in a [`pipeline`]. Instantiate a `pipeline` for image segmentation with your model, and pass your image to it:
-
-```py
->>> from transformers import pipeline
-
->>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model")
->>> segmenter(image)
-[{'score': None,
-  'label': 'wall',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062690>},
- {'score': None,
-  'label': 'sky',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062A50>},
- {'score': None,
-  'label': 'floor',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062B50>},
- {'score': None,
-  'label': 'ceiling',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062A10>},
- {'score': None,
-  'label': 'bed ',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062E90>},
- {'score': None,
-  'label': 'windowpane',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062390>},
- {'score': None,
-  'label': 'cabinet',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062550>},
- {'score': None,
-  'label': 'chair',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062D90>},
- {'score': None,
-  'label': 'armchair',
-  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062E10>}]
-```
 
-You can also manually replicate the results of the `pipeline` if you'd like. Process the image with an image processor and place the `pixel_values` on a GPU:
+We will now see how to infer without a pipeline. Process the image with an image processor and place the `pixel_values` on a GPU:
 
 ```py
 >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available, otherwise use a CPU
diff --git a/docs/source/en/tasks/sequence_classification.md b/docs/source/en/tasks/sequence_classification.md
index c6daa66f362f2a..4a0e5b611c9136 100644
--- a/docs/source/en/tasks/sequence_classification.md
+++ b/docs/source/en/tasks/sequence_classification.md
@@ -33,7 +33,7 @@ The task illustrated in this tutorial is supported by the following model archit
 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
 
 
-[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [CodeLlama](../model_doc/code_llama), [ConvBERT](../model_doc/convbert), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [LLaMA](../model_doc/llama), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Perceiver](../model_doc/perceiver), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [TAPAS](../model_doc/tapas), [Transformer-XL](../model_doc/transfo-xl), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
 
 
 
diff --git a/docs/source/en/tasks/summarization.md b/docs/source/en/tasks/summarization.md
index a7d6a446c0c3e4..535d20ff492b49 100644
--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@@ -35,7 +35,7 @@ The task illustrated in this tutorial is supported by the following model archit
 
 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
 
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SeamlessM4Tv2](../model_doc/seamless_m4t_v2), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
 
 <!--End of the generated tip-->
 
@@ -126,6 +126,7 @@ Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more effic
 
 <frameworkcontent>
 <pt>
+
 ```py
 >>> from transformers import DataCollatorForSeq2Seq
 
@@ -133,6 +134,7 @@ Now create a batch of examples using [`DataCollatorForSeq2Seq`]. It's more effic
 ```
 </pt>
 <tf>
+
 ```py
 >>> from transformers import DataCollatorForSeq2Seq
 
diff --git a/docs/source/en/tasks/text-to-speech.md b/docs/source/en/tasks/text-to-speech.md
index 86a0d49fd04d48..216c3c1f1133f7 100644
--- a/docs/source/en/tasks/text-to-speech.md
+++ b/docs/source/en/tasks/text-to-speech.md
@@ -74,6 +74,12 @@ To follow this guide you will need a GPU. If you're working in a notebook, run t
 !nvidia-smi
 ```
 
+or alternatively for AMD GPUs:
+
+```bash
+!rocm-smi
+```
+
 </Tip>
 
 We encourage you to log in to your Hugging Face account to upload and share your model with the community. When prompted, enter your token to log in:
@@ -630,4 +636,4 @@ see if this improves the results.
 
 Finally, it is essential to consider ethical considerations. Although TTS technology has numerous useful applications, it 
 may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. Please 
-use TTS judiciously and responsibly.
\ No newline at end of file
+use TTS judiciously and responsibly.
diff --git a/docs/source/en/tasks/translation.md b/docs/source/en/tasks/translation.md
index c17e3db23f2e8a..9c73e97bff366f 100644
--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@@ -32,7 +32,7 @@ The task illustrated in this tutorial is supported by the following model archit
 
 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
 
-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SeamlessM4Tv2](../model_doc/seamless_m4t_v2), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
 
 <!--End of the generated tip-->
 
@@ -232,7 +232,7 @@ At this point, only three steps remain:
 ... )
 
 >>> trainer.train()
-````
+```
 
 Once training is completed, share your model to the Hub with the [`~transformers.Trainer.push_to_hub`] method so everyone can use your model:
 
diff --git a/docs/source/en/training.md b/docs/source/en/training.md
index fb4a0b6a279ef9..8e81048bf54e0e 100644
--- a/docs/source/en/training.md
+++ b/docs/source/en/training.md
@@ -43,7 +43,7 @@ Begin by loading the [Yelp Reviews](https://huggingface.co/datasets/yelp_review_
  'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
 ```
 
-As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) method to apply a preprocessing function over the entire dataset:
+As you now know, you need a tokenizer to process the text and include a padding and truncation strategy to handle any variable sequence lengths. To process your dataset in one step, use 🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process#map) method to apply a preprocessing function over the entire dataset:
 
 ```py
 >>> from transformers import AutoTokenizer
@@ -119,7 +119,7 @@ Specify where to save the checkpoints from your training:
 >>> metric = evaluate.load("accuracy")
 ```
 
-Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the predictions to logits (remember all 🤗 Transformers models return logits):
+Call [`~evaluate.compute`] on `metric` to calculate the accuracy of your predictions. Before passing your predictions to `compute`, you need to convert the logits to predictions (remember all 🤗 Transformers models return logits):
 
 ```py
 >>> def compute_metrics(eval_pred):
diff --git a/docs/source/es/_toctree.yml b/docs/source/es/_toctree.yml
index dd110b746c6ee6..0b939ad4113a04 100644
--- a/docs/source/es/_toctree.yml
+++ b/docs/source/es/_toctree.yml
@@ -75,6 +75,10 @@
 - sections:
   - local: philosophy
     title: Filosofía
+  - local: pad_truncation
+    title: Relleno y truncamiento
   - local: bertology
     title: BERTología
+  - local: perplexity
+    title: Perplejidad de los modelos de longitud fija
   title: Guías conceptuales
diff --git a/docs/source/es/converting_tensorflow_models.md b/docs/source/es/converting_tensorflow_models.md
index c7e22bddac705a..8e5b1ad1e288f2 100644
--- a/docs/source/es/converting_tensorflow_models.md
+++ b/docs/source/es/converting_tensorflow_models.md
@@ -96,20 +96,6 @@ transformers-cli convert --model_type gpt2 \
   [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
 ```
 
-## Transformer-XL
-
-Aquí hay un ejemplo del proceso para convertir un modelo Transformer-XL pre-entrenado (más información [aquí](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)):
-
-```bash
-export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-
-transformers-cli convert --model_type transfo_xl \
-  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config TRANSFO_XL_CONFIG] \
-  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
-```
-
 ## XLNet
 
 Aquí hay un ejemplo del proceso para convertir un modelo XLNet pre-entrenado:
diff --git a/docs/source/es/pad_truncation.md b/docs/source/es/pad_truncation.md
new file mode 100644
index 00000000000000..6a31a69103502f
--- /dev/null
+++ b/docs/source/es/pad_truncation.md
@@ -0,0 +1,69 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Relleno y truncamiento
+
+Las entradas agrupadas por lotes (batched) suelen tener longitudes diferentes, por lo que no se pueden convertir en tensores de tamaño fijo. El relleno (también conocido como "Padding") y el truncamiento (conocido como "Truncation") son estrategias para abordar este problema y crear tensores rectangulares a partir de lotes de longitudes variables. El relleno agrega un **padding token** especial para garantizar que las secuencias más cortas tengan la misma longitud que la secuencia más larga en un lote o la longitud máxima aceptada por el modelo. El truncamiento funciona en la otra dirección al truncar secuencias largas.
+
+En la mayoría de los casos, es bastante eficaz rellenar el lote hasta la longitud de la secuencia más larga y truncar hasta la longitud máxima que un modelo puede aceptar. Sin embargo, la API admite más estrategias si las necesitas. Los tres argumentos que necesitas son: `padding`, `truncation` y `max_length`.
+
+El argumento `padding` controla el relleno. Puede ser un booleano o una cadena:
+
+  - `True` o `'longest'`: rellena hasta la longitud de la secuencia más larga en el lote (no se aplica relleno si solo proporcionas una única secuencia).
+  - `'max_length'`: rellena hasta una longitud especificada por el argumento `max_length` o la longitud máxima aceptada
+    por el modelo si no se proporciona `max_length` (`max_length=None`). El relleno se aplicará incluso si solo proporcionas una única secuencia.
+  - `False` o `'do_not_pad'`: no se aplica relleno. Este es el comportamiento predeterminado.
+
+El argumento `truncation` controla el truncamiento. Puede ser un booleano o una cadena:
+
+  - `True` o `'longest_first'`: trunca hasta una longitud máxima especificada por el argumento `max_length` o
+    la longitud máxima aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto
+    truncará token por token, eliminando un token de la secuencia más larga en el par hasta alcanzar la longitud adecuada.
+  - `'only_second'`: trunca hasta una longitud máxima especificada por el argumento `max_length` o la longitud máxima
+    aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto solo truncará
+    la segunda oración de un par si se proporciona un par de secuencias (o un lote de pares de secuencias).
+  - `'only_first'`: trunca hasta una longitud máxima especificada por el argumento `max_length` o la longitud máxima
+    aceptada por el modelo si no se proporciona `max_length` (`max_length=None`). Esto solo truncará
+    la primera oración de un par si se proporciona un par de secuencias (o un lote de pares de secuencias).
+  - `False` o `'do_not_truncate'`: no se aplica truncamiento. Este es el comportamiento predeterminado.
+
+El argumento `max_length` controla la longitud del relleno y del truncamiento. Puede ser un número entero o `None`, en cuyo caso se establecerá automáticamente en la longitud máxima que el modelo puede aceptar. Si el modelo no tiene una longitud máxima de entrada específica, se desactiva el truncamiento o el relleno hasta `max_length`.
+
+La siguiente tabla resume la forma recomendada de configurar el relleno y el truncamiento. Si usas pares de secuencias de entrada en alguno de los siguientes ejemplos, puedes reemplazar `truncation=True` por una `ESTRATEGIA` seleccionada en
+`['only_first', 'only_second', 'longest_first']`, es decir, `truncation='only_second'` o `truncation='longest_first'` para controlar cómo se truncan ambas secuencias en el par, como se detalló anteriormente.
+
+| Truncation                           | Padding                             | Instrucción                                                                                 |
+|-----------------------------------------|--------------------------------------|---------------------------------------------------------------------------------------------|
+| sin truncamiento                       | sin relleno                          | `tokenizer(batch_sentences)`                                                           |
+|                                         | relleno hasta la longitud máxima del lote | `tokenizer(batch_sentences, padding=True)` o                                          |
+|                                         |                                      | `tokenizer(batch_sentences, padding='longest')`                                        |
+|                                         | relleno hasta la longitud máxima del modelo | `tokenizer(batch_sentences, padding='max_length')`                                     |
+|                                         | relleno hasta una longitud específica | `tokenizer(batch_sentences, padding='max_length', max_length=42)`                      |
+|                                         | relleno hasta un múltiplo de un valor | `tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8)`                        |
+| truncamiento hasta la longitud máxima del modelo | sin relleno                          | `tokenizer(batch_sentences, truncation=True)` o                                       |
+|                                         |                                      | `tokenizer(batch_sentences, truncation=ESTRATEGIA)`                                      |
+|                                         | relleno hasta la longitud máxima del lote | `tokenizer(batch_sentences, padding=True, truncation=True)` o                         |
+|                                         |                                      | `tokenizer(batch_sentences, padding=True, truncation=ESTRATEGIA)`                        |
+|                                         | relleno hasta la longitud máxima del modelo | `tokenizer(batch_sentences, padding='max_length', truncation=True)` o                 |
+|                                         |                                      | `tokenizer(batch_sentences, padding='max_length', truncation=ESTRATEGIA)`                |
+|                                         | relleno hasta una longitud específica | No es posible                                                                                |
+| truncamiento hasta una longitud específica | sin relleno                          | `tokenizer(batch_sentences, truncation=True, max_length=42)` o                        |
+|                                         |                                      | `tokenizer(batch_sentences, truncation=ESTRATEGIA, max_length=42)`                       |
+|                                         | relleno hasta la longitud máxima del lote | `tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` o          |
+|                                         |                                      | `tokenizer(batch_sentences, padding=True, truncation=ESTRATEGIA, max_length=42)`         |
+|                                         | relleno hasta la longitud máxima del modelo | No es posible                                                                                |
+|                                         | relleno hasta una longitud específica | `tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` o  |
+|                                         |                                      | `tokenizer(batch_sentences, padding='max_length', truncation=ESTRATEGIA, max_length=42)` |
diff --git a/docs/source/es/perplexity.md b/docs/source/es/perplexity.md
new file mode 100644
index 00000000000000..3e96e9865586f7
--- /dev/null
+++ b/docs/source/es/perplexity.md
@@ -0,0 +1,116 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Perplejidad de los modelos de longitud fija
+
+[[open-in-colab]]
+
+La perplejidad, perplexity en inglés (PPL), es una de las métricas más comunes para evaluar modelos de lenguaje. Antes de sumergirnos, debemos tener en cuenta que esta métrica se aplica específicamente a modelos de lenguaje clásicos (a veces llamados modelos autorregresivos o causales) y no está bien definida para modelos de lenguaje enmascarados como BERT (ver [resumen del modelo](model_summary)).
+
+La perplejidad se define como la media negativa exponenciada del log-likelihood de una secuencia. Si tenemos una secuencia tokenizada \\(X = (x_0, x_1, \dots, x_t)\\), entonces la perplejidad de \\(X\\) es,
+
+$$\text{PPL}(X) = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{<i}) } \right\}$$
+
+donde \\(\log p_\theta (x_i|x_{<i})\\) es el log-likelihood del token i-ésimo condicionado a los tokens precedentes \\(x_{<i}\\) según nuestro modelo. De manera intuitiva, se puede pensar en esto como una evaluación de la capacidad del modelo para predecir de manera uniforme entre el conjunto de tokens especificados en un corpus. Es importante destacar que el procedimiento de tokenización tiene un impacto directo en la perplejidad de un modelo, lo cual siempre debe tenerse en cuenta al comparar diferentes modelos.
+
+Esto también es equivalente a la exponenciación de la entropía cruzada entre los datos y las predicciones del modelo. Para obtener más intuición sobre la perplejidad y su relación con los Bits Por Carácter (BPC) y la compresión de datos, echa un vistazo a esta [fantástica publicación en el blog de "The Gradient"](https://thegradient.pub/understanding-evaluation-metrics-for-language-models/).
+
+## Cálculo de PPL con modelos de longitud fija
+
+Si no estuviéramos limitados por el tamaño del contexto de un modelo, evaluaríamos la perplejidad (PPL) del modelo auto regresivamente factorizando una secuencia y condicionándonos en toda la subsecuencia precedente en cada paso, como se muestra a continuación.
+
+<img width="600" alt="Full decomposition of a sequence with unlimited context length" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/ppl_full.gif"/>
+
+Sin embargo, al trabajar con modelos aproximados, generalmente tenemos una restricción en la cantidad de tokens que el modelo puede procesar. La versión más grande de [GPT-2](model_doc/gpt2), por ejemplo, tiene una longitud fija de 1024 tokens, por lo que no podemos calcular \\(p_\theta(x_t|x_{<t})\\) directamente cuando \\(t\\) es mayor que 1024.
+
+En cambio, la secuencia se divide típicamente en subsecuencias iguales al tamaño máximo de entrada del modelo. Si el tamaño máximo de entrada, de un modelo es \\(k\\), entonces aproximamos la probabilidad de un token \\(x_t\\) condicionándonos solo en los \\(k-1\\) tokens que lo preceden en lugar de todo el contexto. Al evaluar la perplejidad del modelo en una secuencia, un enfoque tentador pero sub óptimo es dividir la secuencia en fragmentos independientes y sumar los log-likelihood descompuestos de cada segmento de manera independiente.
+
+<img width="600" alt="Suboptimal PPL not taking advantage of full available context" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/ppl_chunked.gif"/>
+
+Esto es rápido de calcular, ya que la perplejidad de cada segmento se puede calcular en un solo pase hacia adelante, pero sirve como una aproximación pobre de la perplejidad completamente factorizada y generalmente dará como resultado una PPL más alta (peor) porque el modelo tendrá menos contexto en la mayoría de los pasos de predicción.
+
+En cambio, la PPL de modelos de longitud fija debería evaluarse con una estrategia de ventana deslizante. Esto implica deslizar repetidamente la ventana de contexto para que el modelo tenga más contexto al hacer cada predicción.
+
+<img width="600" alt="Sliding window PPL taking advantage of all available context" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/ppl_sliding.gif"/>
+
+Esta es una aproximación más cercana a la verdadera descomposición de la probabilidad de la secuencia y generalmente dará como resultado una puntuación más favorable. La desventaja es que requiere un pase hacia adelante separado para cada token en el corpus. Un buen compromiso práctico es emplear una ventana deslizante estratificada, moviendo el contexto con pasos más grandes en lugar de deslizarse de 1 token a la vez. Esto permite que la computación avance mucho más rápido, mientras le da al modelo un contexto amplio para hacer
+predicciones en cada paso.
+
+## Ejemplo: Cálculo de la perplejidad con GPT-2 en 🤗 Transformers
+
+Demostremos este proceso con GPT-2.
+
+```python
+from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+
+device = "cuda"
+model_id = "gpt2-large"
+model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
+tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
+```
+
+Carguemos el conjunto de datos WikiText-2 y evaluemos la perplejidad utilizando algunas estrategias de ventana deslizante diferentes. Dado que este conjunto de datos es pequeño y solo estamos realizando un pase hacia adelante sobre el conjunto, podemos cargar y codificar todo el conjunto de datos en la memoria.
+
+```python
+from datasets import load_dataset
+
+test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
+encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")
+```
+
+Con 🤗 Transformers, simplemente podemos pasar los `input_ids` como las `labels` a nuestro modelo, y la media negativa del log-likelihood para cada token se devuelve como la pérdida. Sin embargo, con nuestro enfoque de ventana deslizante, hay superposición en los tokens que pasamos al modelo en cada iteración. No queremos que el log-likelihood de los tokens que estamos tratando solo como contexto se incluya en nuestra pérdida, por lo que podemos establecer estos objetivos en `-100` para que se ignoren. El siguiente es un ejemplo de cómo podríamos hacer esto con un paso de `512`. Esto significa que el modelo tendrá al menos `512` tokens como contexto al calcular el log-likelihood condicional de cualquier token (siempre que haya `512` tokens precedentes disponibles para condicionar).
+
+```python
+import torch
+from tqdm import tqdm
+
+max_length = model.config.n_positions
+stride = 512
+seq_len = encodings.input_ids.size(1)
+
+nlls = []
+prev_end_loc = 0
+for begin_loc in tqdm(range(0, seq_len, stride)):
+    end_loc = min(begin_loc + max_length, seq_len)
+    trg_len = end_loc - prev_end_loc  # puede ser diferente del paso en el último bucle
+    input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
+    target_ids = input_ids.clone()
+    target_ids[:, :-trg_len] = -100
+
+    with torch.no_grad():
+        outputs = model(input_ids, labels=target_ids)
+
+        # la pérdida se calcula utilizando CrossEntropyLoss, que promedia las etiquetas válidas
+        # N.B. el modelo solo calcula la pérdida sobre trg_len - 1 etiquetas, porque desplaza las etiqueta internamente
+        # a la izquierda por 1.
+        neg_log_likelihood = outputs.loss
+
+    nlls.append(neg_log_likelihood)
+
+    prev_end_loc = end_loc
+    if end_loc == seq_len:
+        break
+
+ppl = torch.exp(torch.stack(nlls).mean())
+```
+
+Ejecuta esto con la longitud de paso igual a la longitud máxima de entrada es equivalente a la estrategia sub óptima,
+sin ventana deslizante, que discutimos anteriormente. Cuanto menor sea el paso, más contexto tendrá el modelo para
+realizar cada predicción y, por lo general, mejor será la perplejidad informada.
+
+Cuando ejecutamos lo anterior con `stride = 1024`, es decir, sin superposición, la PPL resultante es `19.44`, que es
+aproximadamente la misma que la `19.93` informada en el artículo de GPT-2. Al utilizar `stride = 512` y, por lo tanto,
+emplear nuestra estrategia de ventana deslizante, esto disminuye a `16.45`. Esto no solo es una puntuación más favorable, sino que se calcula de una manera más cercana a la verdadera descomposición autorregresiva de la probabilidad de una secuencia.
diff --git a/docs/source/es/preprocessing.md b/docs/source/es/preprocessing.md
index f4eec4862be8be..5ac4c018090bf1 100644
--- a/docs/source/es/preprocessing.md
+++ b/docs/source/es/preprocessing.md
@@ -195,7 +195,7 @@ Las entradas de audio se preprocesan de forma diferente a las entradas textuales
 pip install datasets
 ```
 
-Carga la tarea de detección de palabras clave del benchmark [SUPERB](https://huggingface.co/datasets/superb) (consulta el [tutorial 🤗 Dataset](https://huggingface.co/docs/datasets/load_hub.html) para que obtengas más detalles sobre cómo cargar un dataset):
+Carga la tarea de detección de palabras clave del benchmark [SUPERB](https://huggingface.co/datasets/superb) (consulta el [tutorial 🤗 Dataset](https://huggingface.co/docs/datasets/load_hub) para que obtengas más detalles sobre cómo cargar un dataset):
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -234,7 +234,7 @@ Por ejemplo, carga el dataset [LJ Speech](https://huggingface.co/datasets/lj_spe
  'sampling_rate': 22050}
 ```
 
-1. Usa el método 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) para reducir la tasa de muestreo a 16kHz:
+1. Usa el método 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.cast_column) para reducir la tasa de muestreo a 16kHz:
 
 ```py
 >>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))
@@ -329,7 +329,7 @@ Vamos a cargar el dataset [food101](https://huggingface.co/datasets/food101) par
 >>> dataset = load_dataset("food101", split="train[:100]")
 ```
 
-A continuación, observa la imagen con la función 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image):
+A continuación, observa la imagen con la función 🤗 Datasets [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image):
 
 ```py
 >>> dataset[0]["image"]
@@ -370,7 +370,7 @@ Para las tareas de visión por computadora es común añadir algún tipo de aume
 ...     return examples
 ```
 
-3. A continuación, utiliza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) para aplicar las transformaciones sobre la marcha:
+3. A continuación, utiliza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process#format-transform) para aplicar las transformaciones sobre la marcha:
 
 ```py
 >>> dataset.set_transform(transforms)
diff --git a/docs/source/es/run_scripts.md b/docs/source/es/run_scripts.md
index a66fd1e47e1386..8b762fdddc28fc 100644
--- a/docs/source/es/run_scripts.md
+++ b/docs/source/es/run_scripts.md
@@ -130,7 +130,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 - Establece la cantidad de GPU que se usará con el argumento `nproc_per_node`.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md
index 3a959aa934ffa8..f09730caf69fee 100644
--- a/docs/source/es/tasks/image_classification.md
+++ b/docs/source/es/tasks/image_classification.md
@@ -99,7 +99,7 @@ Crea una función de preprocesamiento que aplique las transformaciones y devuelv
 ...     return examples
 ```
 
-Utiliza el método [`with_transform`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?#datasets.Dataset.with_transform) de 🤗 Dataset para aplicar las transformaciones sobre todo el dataset. Las transformaciones se aplican sobre la marcha cuando se carga un elemento del dataset:
+Utiliza el método [`with_transform`](https://huggingface.co/docs/datasets/package_reference/main_classes?#datasets.Dataset.with_transform) de 🤗 Dataset para aplicar las transformaciones sobre todo el dataset. Las transformaciones se aplican sobre la marcha cuando se carga un elemento del dataset:
 
 ```py
 >>> food = food.with_transform(transforms)
diff --git a/docs/source/es/tasks/language_modeling.md b/docs/source/es/tasks/language_modeling.md
index 66ac8fb0d4b56a..b3f22f0846335d 100644
--- a/docs/source/es/tasks/language_modeling.md
+++ b/docs/source/es/tasks/language_modeling.md
@@ -94,7 +94,7 @@ Para modelados de lenguaje por enmascaramiento carga el tokenizador DistilRoBERT
 >>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
 ```
 
-Extrae el subcampo `text` desde su estructura anidado con el método [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten):
+Extrae el subcampo `text` desde su estructura anidado con el método [`flatten`](https://huggingface.co/docs/datasets/process#flatten):
 
 ```py
 >>> eli5 = eli5.flatten()
@@ -249,7 +249,7 @@ A este punto, solo faltan tres pasos:
 ```
 </pt>
 <tf>
-Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator:
+Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator:
 
 ```py
 >>> tf_train_set = lm_dataset["train"].to_tf_dataset(
@@ -356,7 +356,7 @@ A este punto, solo faltan tres pasos:
 ```
 </pt>
 <tf>
-Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator:
+Para realizar el fine-tuning de un modelo en TensorFlow, comienza por convertir tus datasets al formato `tf.data.Dataset` con [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.to_tf_dataset). Especifica los inputs y etiquetas en `columns`, ya sea para mezclar el dataset, tamaño de lote, y el data collator:
 
 ```py
 >>> tf_train_set = lm_dataset["train"].to_tf_dataset(
diff --git a/docs/source/es/training.md b/docs/source/es/training.md
index 7b7b0657bd8f16..4f224b0797a3b9 100644
--- a/docs/source/es/training.md
+++ b/docs/source/es/training.md
@@ -102,7 +102,7 @@ Especifica dónde vas a guardar los checkpoints de tu entrenamiento:
 
 ### Métricas
 
-El [`Trainer`] no evalúa automáticamente el rendimiento del modelo durante el entrenamiento. Tendrás que pasarle a [`Trainer`] una función para calcular y hacer un reporte de las métricas. La biblioteca de 🤗 Datasets proporciona una función de [`accuracy`](https://huggingface.co/metrics/accuracy) simple que puedes cargar con la función `load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics.html) para más información):
+El [`Trainer`] no evalúa automáticamente el rendimiento del modelo durante el entrenamiento. Tendrás que pasarle a [`Trainer`] una función para calcular y hacer un reporte de las métricas. La biblioteca de 🤗 Datasets proporciona una función de [`accuracy`](https://huggingface.co/metrics/accuracy) simple que puedes cargar con la función `load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics) para más información):
 
 ```py
 >>> import numpy as np
@@ -172,7 +172,7 @@ El [`DefaultDataCollator`] junta los tensores en un batch para que el modelo se
 
 </Tip>
 
-A continuación, convierte los datasets tokenizados en datasets de TensorFlow con el método [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Especifica tus entradas en `columns` y tu etiqueta en `label_cols`:
+A continuación, convierte los datasets tokenizados en datasets de TensorFlow con el método [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.to_tf_dataset). Especifica tus entradas en `columns` y tu etiqueta en `label_cols`:
 
 ```py
 >>> tf_train_dataset = small_train_dataset.to_tf_dataset(
@@ -342,7 +342,7 @@ Para hacer un seguimiento al progreso del entrenamiento, utiliza la biblioteca [
 
 ### Métricas
 
-De la misma manera que necesitas añadir una función de evaluación al [`Trainer`], necesitas hacer lo mismo cuando escribas tu propio ciclo de entrenamiento. Pero en lugar de calcular y reportar la métrica al final de cada época, esta vez acumularás todos los batches con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) y calcularás la métrica al final.
+De la misma manera que necesitas añadir una función de evaluación al [`Trainer`], necesitas hacer lo mismo cuando escribas tu propio ciclo de entrenamiento. Pero en lugar de calcular y reportar la métrica al final de cada época, esta vez acumularás todos los batches con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=add_batch#datasets.Metric.add_batch) y calcularás la métrica al final.
 
 ```py
 >>> metric = load_metric("accuracy")
diff --git a/docs/source/fr/_toctree.yml b/docs/source/fr/_toctree.yml
index 11632a423b6af1..12c2feb0a02eb5 100755
--- a/docs/source/fr/_toctree.yml
+++ b/docs/source/fr/_toctree.yml
@@ -1,156 +1,30 @@
 - sections:
-  - local: index
-    title: 🤗 Transformers
-  - local: quicktour
-    title: Visite rapide
-  - local: in_translation
-    title: Installation
+    - local: index
+      title: 🤗 Transformers
+    - local: quicktour
+      title: Visite rapide
+    - local: installation
+      title: Installation
   title: Démarrer
 - sections:
-  - local: in_translation
-    title: Pipelines pour l'inférence
-  - local: in_translation
-    title: Chargement d'instances pré-entraînées avec une AutoClass
-  - local: in_translation
-    title: Préparation des données
-  - local: in_translation
-    title: Fine-tune un modèle pré-entraîné
-  - local: in_translation
-    title: Entraînement distribué avec 🤗 Accelerate
-  - local: in_translation
-    title: Partager un modèle
-  title: Tutoriels
-- sections:
-  - sections:
-    - local: in_translation
-      title: Créer votre architecture
-    - local: in_translation
-      title: Partager vos modèles
-    - local: in_translation
-      title: Entraînement avec un script
-    - local: in_translation
-      title: Entraînement avec Amazon SageMaker
-    - local: in_translation
-      title: Convertir depuis des checkpoints Tensorflow
-    - local: in_translation
-      title: Exporter vers ONNX
-    - local: in_translation
-      title: Exporter vers TorchScript
-    - local: in_translation
-      title: Aide au dépannage
-    title: Usage général
-  - sections:
-    - local: in_translation
-      title: Utiliser les tokenizers de 🤗 Tokenizers
-    - local: in_translation
-      title: Inférence avec les modèles multilingues
-    - local: in_translation
-      title: Stratégies de génération de texte
-    - sections:
-      - isExpanded: false
-        local: in_translation
-        title: Classification de texte
-      - local: in_translation
-        title: Classification de token
-      - local: in_translation
-        title: Système de question-réponse
-      - local: in_translation
-        title: Modélisation causale du langage
-      - local: in_translation
-        title: Modélisation du langage avec masque
-      - local: in_translation
-        title: Traduction
-      - local: in_translation
-        title: Génération de résumé
-      - local: in_translation
-        title: Question à choix multiple
-      title: Guides des tâches
-    title: Traitement automatique des langues
-  - sections:
-    - local: in_translation
-      title: Classification audio
-    - local: in_translation
-      title: Reconnaissance automatique de la parole
-    title: Audio
-  - sections:
     - local: in_translation
-      title: Classification d'images
+      title: Pipelines pour l'inférence
+    - local: autoclass_tutorial
+      title: Chargement d'instances pré-entraînées avec une AutoClass
     - local: in_translation
-      title: Segmentation sémantique
+      title: Préparation des données
     - local: in_translation
-      title: Classification de vidéos
+      title: Fine-tune un modèle pré-entraîné
     - local: in_translation
-      title: Détection d'objets
-    title: Vision par ordinateur
-  - sections:
-    - local: in_translation
-      title: Performance et extensibilité
-  - sections:
-    - local: in_translation
-      title: Comment contribuer à transformers?
-    - local: in_translation
-      title: Comment ajouter un modèle à 🤗 Transformers?
-    - local: in_translation
-      title: Comment convertir un modèle 🤗 Transformers vers TensorFlow?
-    - local: in_translation
-      title: Comment ajouter un pipeline à 🤗 Transformers?
-    - local: in_translation
-      title: Tester
-    - local: in_translation
-      title: Vérification pour une Pull Request
-    title: Contribuer
-  - local: in_translation
-    title: 🤗 Transformers Notebooks
-  - local: in_translation
-    title: Ressources communautaires
-  - local: in_translation
-    title: Benchmarks
-  - local: in_translation
-    title: Migration à partir de versions précédentes
-  title: Guides d'utilisation
-- sections:
-  - local: in_translation
-    title: Philosophie
-  - local: in_translation
-    title: Glossaire
-  - local: in_translation
-    title: Qu'est ce 🤗 Transformers peut faire ?
-  - local: in_translation
-    title: Quelles tâches 🤗 Transformers peut résoudre ?
-  - local: in_translation
-    title: Résumé des modèles
-  - local: in_translation
-    title: Résumé des tokenizers
-  - local: in_translation
-    title: Remplissage et troncature
-  - local: in_translation
-    title: BERTology
-  - local: in_translation
-    title: Perplexité des modèles à longueur fixe
-  - local: in_translation
-    title: Pipelines pour inférence avec des serveurs web
-  title: Guides conceptuels
-- sections:
-  - isExpanded: false
-    sections:
-    - local: in_translation
-      title: Classes principales
-    - local: in_translation
-      title: Modèles textuels
-    - local: in_translation
-      title: Modèles visuels
-    - local: in_translation
-      title: Modèles audio
+      title: Entraînement avec un script
     - local: in_translation
-      title: Modèles multimodal
+      title: Entraînement distribué avec 🤗 Accelerate
     - local: in_translation
-      title: Modèles d'apprentissage par renforcement
+      title: Chargement et entraînement des adaptateurs avec 🤗 PEFT
     - local: in_translation
-      title: Modèles de séries temporelles
+      title: Partager un modèle
     - local: in_translation
-      title: Graph models
-    title: Modèles
-  - sections:
+      title: Agents
     - local: in_translation
-      title: Utilitaires internes
-  title: API
+      title: Génération avec LLMs
+  title: Tutoriels
diff --git a/docs/source/fr/autoclass_tutorial.md b/docs/source/fr/autoclass_tutorial.md
new file mode 100644
index 00000000000000..392e2a6807e55d
--- /dev/null
+++ b/docs/source/fr/autoclass_tutorial.md
@@ -0,0 +1,142 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Chargement d'instances pré-entraînées avec une AutoClass
+
+Avec autant d'architectures Transformer différentes, il peut être difficile d'en créer une pour votre ensemble de poids (aussi appelés "weights" ou "checkpoint" en anglais). Dans l'idée de créer une librairie facile, simple et flexible à utiliser, 🤗 Transformers fournit une `AutoClass` qui infère et charge automatiquement l'architecture correcte à partir d'un ensemble de poids donné. La fonction `from_pretrained()` vous permet de charger rapidement un modèle pré-entraîné pour n'importe quelle architecture afin que vous n'ayez pas à consacrer du temps et des ressources à l'entraînement d'un modèle à partir de zéro. Produire un tel code indépendant d'un ensemble de poids signifie que si votre code fonctionne pour un ensemble de poids, il fonctionnera avec un autre ensemble - tant qu'il a été entraîné pour une tâche similaire - même si l'architecture est différente.
+
+<Tip>
+
+Rappel, l'architecture fait référence au squelette du modèle et l'ensemble de poids contient les poids pour une architecture donnée. Par exemple, [BERT](https://huggingface.co/bert-base-uncased) est une architecture, tandis que `bert-base-uncased` est un ensemble de poids. Le terme modèle est général et peut signifier soit architecture soit ensemble de poids.
+
+</Tip>
+
+Dans ce tutoriel, vous apprendrez à:
+
+  * Charger un tokenizer pré-entraîné.
+  * Charger un processeur d'image pré-entraîné.
+  * Charger un extracteur de caractéristiques pré-entraîné.
+  * Charger un processeur pré-entraîné.
+  * Charger un modèle pré-entraîné.
+
+## AutoTokenizer
+
+Quasiment toutes les tâches de traitement du langage (NLP) commencent avec un tokenizer. Un tokenizer convertit votre texte initial dans un format qui peut être traité par le modèle.
+
+Chargez un tokenizer avec [`AutoTokenizer.from_pretrained`]:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+```
+
+Puis, transformez votre texte initial comme montré ci-dessous:
+
+```py
+>>> sequence = "In a hole in the ground there lived a hobbit."
+>>> print(tokenizer(sequence))
+{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], 
+ 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+ 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+## AutoImageProcessor
+
+Pour les tâches de vision, un processeur d'image traite l'image pour la formater correctment.
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```
+
+## AutoFeatureExtractor
+
+Pour les tâches audio, un extracteur de caractéristiques (aussi appelés "features" en anglais) traite le signal audio pour le formater correctement.
+
+Chargez un extracteur de caractéristiques avec [`AutoFeatureExtractor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained(
+...     "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+... )
+```
+
+## AutoProcessor
+
+Les tâches multimodales nécessitent un processeur qui combine deux types d'outils de prétraitement. Par exemple, le modèle [LayoutLMV2](model_doc/layoutlmv2) nécessite un processeur d'image pour traiter les images et un tokenizer pour traiter le texte ; un processeur combine les deux.
+
+Chargez un processeur avec [`AutoProcessor.from_pretrained`]:
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+```
+
+## AutoModel
+
+<frameworkcontent>
+<pt>
+Enfin, les classes `AutoModelFor` vous permettent de charger un modèle pré-entraîné pour une tâche donnée (voir [ici](model_doc/auto) pour une liste complète des tâches disponibles). Par exemple, chargez un modèle pour la classification de séquence avec [`AutoModelForSequenceClassification.from_pretrained`]:
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+```
+
+Réutilisez facilement le même ensemble de poids pour charger une architecture pour une tâche différente :
+
+```py
+>>> from transformers import AutoModelForTokenClassification
+
+>>> model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+```
+
+<Tip warning={true}>
+
+Pour les modèles PyTorch, la fonction `from_pretrained()` utilise `torch.load()` qui utilise `pickle` en interne et est connu pour être non sécurisé. En général, ne chargez jamais un modèle qui pourrait provenir d'une source non fiable, ou qui pourrait avoir été altéré. Ce risque de sécurité est partiellement atténué pour les modèles hébergés publiquement sur le Hugging Face Hub, qui sont [scannés pour les logiciels malveillants](https://huggingface.co/docs/hub/security-malware) à chaque modification. Consultez la [documentation du Hub](https://huggingface.co/docs/hub/security) pour connaître les meilleures pratiques comme la [vérification des modifications signées](https://huggingface.co/docs/hub/security-gpg#signing-commits-with-gpg) avec GPG.
+
+Les points de contrôle TensorFlow et Flax ne sont pas concernés, et peuvent être chargés dans des architectures PyTorch en utilisant les arguments `from_tf` et `from_flax` de la fonction `from_pretrained` pour contourner ce problème.
+
+</Tip>
+
+En général, nous recommandons d'utiliser les classes `AutoTokenizer` et `AutoModelFor` pour charger des instances pré-entraînées de tokenizers et modèles respectivement. Cela vous permettra de charger la bonne architecture à chaque fois. Dans le prochain [tutoriel](preprocessing), vous apprenez à utiliser un tokenizer, processeur d'image, extracteur de caractéristiques et processeur pour pré-traiter un jeu de données pour le fine-tuning.
+</pt>
+<tf>
+Enfin, les classes `TFAutoModelFor` vous permettent de charger un modèle pré-entraîné pour une tâche donnée (voir [ici](model_doc/auto) pour une liste complète des tâches disponibles). Par exemple, chargez un modèle pour la classification de séquence avec [`TFAutoModelForSequenceClassification.from_pretrained`]:
+
+```py
+>>> from transformers import TFAutoModelForSequenceClassification
+
+>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
+```
+
+Réutilisez facilement le même ensemble de poids pour charger une architecture pour une tâche différente :
+
+```py
+>>> from transformers import TFAutoModelForTokenClassification
+
+>>> model = TFAutoModelForTokenClassification.from_pretrained("distilbert-base-uncased")
+```
+
+En général, nous recommandons d'utiliser les classes `AutoTokenizer` et `TFAutoModelFor` pour charger des instances pré-entraînées de tokenizers et modèles respectivement. Cela vous permettra de charger la bonne architecture à chaque fois. Dans le prochain [tutoriel](preprocessing), vous apprenez à utiliser un tokenizer, processeur d'image, extracteur de caractéristiques et processeur pour pré-traiter un jeu de données pour le fine-tuning.
+</tf>
+</frameworkcontent>
diff --git a/docs/source/fr/installation.md b/docs/source/fr/installation.md
new file mode 100644
index 00000000000000..f529c4e201dad3
--- /dev/null
+++ b/docs/source/fr/installation.md
@@ -0,0 +1,258 @@
+<!---
+Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Installation
+
+Installez 🤗 Transformers pour n'importe quelle librairie d'apprentissage profond avec laquelle vous avez l'habitude de travaillez, configurez votre cache et configurez 🤗 Transformers pour un usage hors ligne (facultatif).
+
+🤗 Transformers est testé avec Python 3.6+, PyTorch 1.1.0+, TensorFlow 2.0+ et Flax.
+Consulter les instructions d'installation ci-dessous pour la librairie d'apprentissage profond que vous utilisez:
+
+  * Instructions d'installation pour [PyTorch](https://pytorch.org/get-started/locally/).
+  * Instructions d'installation pour [TensorFlow 2.0](https://www.tensorflow.org/install/pip).
+  * Instructions d'installation pour [Flax](https://flax.readthedocs.io/en/latest/).
+
+## Installation avec pip
+
+Vous devriez installer 🤗 Transformers dans un [environnement virtuel](https://docs.python.org/3/library/venv.html).
+Si vous n'êtes pas à l'aise avec les environnements virtuels, consultez ce [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
+Utiliser un environnement virtuel permet de facilement gérer différents projets et d'éviter des erreurs de compatibilité entre les différentes dépendances.
+
+Commencez par créer un environnement virtuel dans l'espace de travail de votre projet :
+
+```bash
+python -m venv .env
+```
+
+Activez l'environnement virtuel. Sur Linux ou MacOs :
+
+```bash
+source .env/bin/activate
+```
+
+Activez l'environnement virtuel sur Windows :
+
+```bash
+.env/Scripts/activate
+```
+
+Maintenant, 🤗 Transformers peut être installé avec la commande suivante :
+
+```bash
+pip install transformers
+```
+
+Pour une utilisation avec CPU seulement, 🤗 Transformers et la librairie d'apprentissage profond de votre choix peuvent être installés en une seule ligne.
+Par exemple, installez 🤗 Transformers et PyTorch avec la commande suivante :
+
+```bash
+pip install 'transformers[torch]'
+```
+
+🤗 Transformers et TensorFlow 2.0 :
+
+```bash
+pip install 'transformers[tf-cpu]'
+```
+
+<Tip warning={true}>
+
+Pour les architectures mac M1 / ARM
+
+Vous devez installer les outils suivants avant d'installer TensorFLow 2.0
+
+```
+brew install cmake
+brew install pkg-config
+```
+
+</Tip>
+
+🤗 Transformers et Flax :
+
+```bash
+pip install 'transformers[flax]'
+```
+
+Vérifiez que 🤗 Transformers a bien été installé avec la commande suivante. La commande va télécharger un modèle pré-entraîné :
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
+```
+
+Le label et score sont ensuite affichés :
+
+```bash
+[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
+```
+
+## Installation depuis le code source
+
+Installez 🤗 Transformers depuis le code source avec la commande suivante :
+
+```bash
+pip install git+https://github.com/huggingface/transformers
+```
+
+Cette commande installe la version depuis la branche `main` au lieu de la dernière version stable. La version de la branche `main` est utile pour avoir les derniers développements. Par exemple, si un bug a été résolu depuis la dernière version stable mais n'a pas encore été publié officiellement. Cependant, cela veut aussi dire que la version de la branche `main` n'est pas toujours stable. Nous nous efforçons de maintenir la version de la branche `main` opérationnelle, et la plupart des problèmes sont généralement résolus en l'espace de quelques heures ou d'un jour. Si vous recontrez un problème, n'hésitez pas à créer une [Issue](https://github.com/huggingface/transformers/issues) pour que l'on puisse trouver une solution au plus vite !
+
+Vérifiez que 🤗 Transformers a bien été installé avec la commande suivante :
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I love you'))"
+```
+
+## Installation modifiable
+
+Vous aurez besoin d'une installation modifiable si vous le souhaitez :
+
+  * Utiliser la version de la branche `main` du code source.
+  * Contribuer à 🤗 Transformers et vouler tester vos modifications du code source.
+
+Clonez le projet et installez 🤗 Transformers avec les commandes suivantes :
+
+```bash
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+pip install -e .
+```
+
+Ces commandes créent des liens entre le dossier où le projet a été cloné et les chemins de vos librairies Python. Python regardera maintenant dans le dossier que vous avez cloné en plus des dossiers où sont installées vos autres librairies. Par exemple, si vos librairies Python sont installées dans `~/anaconda3/envs/main/lib/python3.7/site-packages/`, Python cherchera aussi dans le dossier où vous avez cloné : `~/transformers/`.
+
+<Tip warning={true}>
+
+Vous devez garder le dossier `transformers` si vous voulez continuer d'utiliser la librairie.
+
+</Tip>
+
+Maintenant, vous pouvez facilement mettre à jour votre clone avec la dernière version de 🤗 Transformers en utilisant la commande suivante :
+
+```bash
+cd ~/transformers/
+git pull
+```
+
+Votre environnement Python utilisera la version de la branche `main` lors de la prochaine exécution.
+
+## Installation avec conda
+
+Installation via le canal `huggingface` de conda :
+
+```bash
+conda install -c huggingface transformers
+```
+
+## Configuration du cache
+
+Les modèles pré-entraînés sont téléchargés et mis en cache localement dans le dossier suivant : `~/.cache/huggingface/hub`. C'est le dossier par défaut donné par la variable d'environnement `TRANSFORMERS_CACHE`. Sur Windows, le dossier par défaut est `C:\Users\nom_utilisateur\.cache\huggingface\hub`. Vous pouvez modifier les variables d'environnement indiquées ci-dessous - par ordre de priorité - pour spécifier un dossier de cache différent :
+
+1. Variable d'environnement (par défaut) : `HUGGINGFACE_HUB_CACHE` ou `TRANSFORMERS_CACHE`.
+2. Variable d'environnement : `HF_HOME`.
+3. Variable d'environnement : `XDG_CACHE_HOME` + `/huggingface`.
+
+<Tip>
+
+🤗 Transformers utilisera les variables d'environnement `PYTORCH_TRANSFORMERS_CACHE` ou `PYTORCH_PRETRAINED_BERT_CACHE` si vous utilisez une version précédente de cette librairie et avez défini ces variables d'environnement, sauf si vous spécifiez la variable d'environnement `TRANSFORMERS_CACHE`.
+
+</Tip>
+
+## Mode hors ligne
+
+🤗 Transformers peut fonctionner dans un environnement cloisonné ou hors ligne en n'utilisant que des fichiers locaux. Définissez la variable d'environnement `TRANSFORMERS_OFFLINE=1` pour activer ce mode.
+
+<Tip>
+
+Ajoutez [🤗 Datasets](https://huggingface.co/docs/datasets/) à votre processus d'entraînement hors ligne en définissant la variable d'environnement `HF_DATASETS_OFFLINE=1`.
+
+</Tip>
+
+```bash
+HF_DATASETS_OFFLINE=1 TRANSFORMERS_OFFLINE=1 \
+python examples/pytorch/translation/run_translation.py --model_name_or_path t5-small --dataset_name wmt16 --dataset_config ro-en ...
+```
+
+Le script devrait maintenant s'exécuter sans rester en attente ou attendre une expiration, car il n'essaiera pas de télécharger des modèle sur le Hub.
+
+Vous pouvez aussi éviter de télécharger un modèle à chaque appel de la fonction [~PreTrainedModel.from_pretrained] en utilisant le paramètre [local_files_only]. Seuls les fichiers locaux sont chargés lorsque ce paramètre est activé (c.-à-d. `local_files_only=True`) :
+
+```py
+from transformers import T5Model
+
+model = T5Model.from_pretrained("./path/to/local/directory", local_files_only=True)
+```
+
+### Récupérer des modèles et des tokenizers pour une utilisation hors ligne
+
+Une autre option pour utiliser 🤗 Transformers hors ligne est de télécharger les fichiers à l'avance, puis d'utiliser les chemins locaux lorsque vous en avez besoin en mode hors ligne. Il existe trois façons de faire cela :
+
+  * Téléchargez un fichier via l'interface utilisateur sur le [Model Hub](https://huggingface.co/models) en cliquant sur l'icône ↓.
+
+    ![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/download-icon.png)
+
+  * Utilisez les fonctions [`PreTrainedModel.from_pretrained`] et [`PreTrainedModel.save_pretrained`] :
+
+    1. Téléchargez vos fichiers à l'avance avec [`PreTrainedModel.from_pretrained`]:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/T0_3B")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0_3B")
+    ```
+
+    2. Sauvegardez les fichiers dans un dossier de votre choix avec [`PreTrainedModel.save_pretrained`]:
+
+    ```py
+    >>> tokenizer.save_pretrained("./your/path/bigscience_t0")
+    >>> model.save_pretrained("./your/path/bigscience_t0")
+    ```
+
+    3. Maintenant, lorsque vous êtes hors ligne, rechargez vos fichiers avec [`PreTrainedModel.from_pretrained`] depuis le dossier où vous les avez sauvegardés :
+
+    ```py
+    >>> tokenizer = AutoTokenizer.from_pretrained("./your/path/bigscience_t0")
+    >>> model = AutoModel.from_pretrained("./your/path/bigscience_t0")
+    ```
+
+  * Téléchargez des fichiers de manière automatique avec la librairie [huggingface_hub](https://github.com/huggingface/huggingface_hub/tree/main/src/huggingface_hub) :
+
+    1. Installez la librairie `huggingface_hub`  dans votre environnement virtuel :
+
+    ```bash
+    python -m pip install huggingface_hub
+    ```
+
+    2. Utilisez la fonction [`hf_hub_download`](https://huggingface.co/docs/hub/adding-a-library#download-files-from-the-hub) pour télécharger un fichier vers un chemin de votre choix.  Par exemple, la commande suivante télécharge le fichier `config.json` du modèle [T0](https://huggingface.co/bigscience/T0_3B) vers le chemin de votre choix :
+
+    ```py
+    >>> from huggingface_hub import hf_hub_download
+
+    >>> hf_hub_download(repo_id="bigscience/T0_3B", filename="config.json", cache_dir="./your/path/bigscience_t0")
+    ```
+
+Une fois que votre fichier est téléchargé et caché localement, spécifiez son chemin local pour le charger et l'utiliser :
+
+```py
+>>> from transformers import AutoConfig
+
+>>> config = AutoConfig.from_pretrained("./your/path/bigscience_t0/config.json")
+```
+
+<Tip>
+
+Consultez la section [How to download files from the Hub (Comment télécharger des fichiers depuis le Hub)](https://huggingface.co/docs/hub/how-to-downstream) pour plus de détails sur le téléchargement de fichiers stockés sur le Hub.
+
+</Tip>
diff --git a/docs/source/it/converting_tensorflow_models.md b/docs/source/it/converting_tensorflow_models.md
index 04398636359ce5..f6326daa735fbe 100644
--- a/docs/source/it/converting_tensorflow_models.md
+++ b/docs/source/it/converting_tensorflow_models.md
@@ -104,21 +104,6 @@ transformers-cli convert --model_type gpt2 \
   [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
 ```
 
-## Transformer-XL
-
-
-Ecco un esempio del processo di conversione di un modello Transformer-XL pre-allenato 
-(vedi [qui](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models)):
-
-```bash
-export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-transformers-cli convert --model_type transfo_xl \
-  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config TRANSFO_XL_CONFIG] \
-  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
-```
-
 ## XLNet
 
 Ecco un esempio del processo di conversione di un modello XLNet pre-allenato:
diff --git a/docs/source/it/perf_hardware.md b/docs/source/it/perf_hardware.md
index a579362e2b1b9d..dd1187a01b5938 100644
--- a/docs/source/it/perf_hardware.md
+++ b/docs/source/it/perf_hardware.md
@@ -134,7 +134,7 @@ Ecco il codice benchmark completo e gli output:
 ```bash
 # DDP w/ NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
@@ -143,7 +143,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch
 
 # DDP w/o NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
diff --git a/docs/source/it/preprocessing.md b/docs/source/it/preprocessing.md
index 94578dfe166b77..76addd2aa0ea3c 100644
--- a/docs/source/it/preprocessing.md
+++ b/docs/source/it/preprocessing.md
@@ -194,7 +194,7 @@ Gli input audio sono processati in modo differente rispetto al testo, ma l'obiet
 pip install datasets
 ```
 
-Carica il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (vedi il 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub.html) per avere maggiori dettagli su come caricare un dataset):
+Carica il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) (vedi il 🤗 [Datasets tutorial](https://huggingface.co/docs/datasets/load_hub) per avere maggiori dettagli su come caricare un dataset):
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -233,7 +233,7 @@ Per esempio, il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds1
  'sampling_rate': 8000}
 ```
 
-1. Usa il metodo di 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.cast_column) per alzare la frequenza di campionamento a 16kHz:
+1. Usa il metodo di 🤗 Datasets' [`cast_column`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.cast_column) per alzare la frequenza di campionamento a 16kHz:
 
 ```py
 >>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
@@ -370,7 +370,7 @@ Per le attività di visione, è usuale aggiungere alcuni tipi di data augmentati
 ...     return examples
 ```
 
-3. Poi utilizza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)per applicare al volo la trasformazione:
+3. Poi utilizza 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process#format-transform)per applicare al volo la trasformazione:
 
 ```py
 >>> dataset.set_transform(transforms)
diff --git a/docs/source/it/quicktour.md b/docs/source/it/quicktour.md
index f0e981d18eb77d..07e7a2974a1fbc 100644
--- a/docs/source/it/quicktour.md
+++ b/docs/source/it/quicktour.md
@@ -125,7 +125,7 @@ Crea una [`pipeline`] con il compito che vuoi risolvere e con il modello che vuo
 ... )
 ```
 
-Poi, carica un dataset (vedi 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart.html) per maggiori dettagli) sul quale vuoi iterare. Per esempio, carichiamo il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14):
+Poi, carica un dataset (vedi 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart) per maggiori dettagli) sul quale vuoi iterare. Per esempio, carichiamo il dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14):
 
 ```py
 >>> from datasets import load_dataset, Audio
diff --git a/docs/source/it/run_scripts.md b/docs/source/it/run_scripts.md
index 327eb9374d3873..c376ff32c2a884 100644
--- a/docs/source/it/run_scripts.md
+++ b/docs/source/it/run_scripts.md
@@ -130,7 +130,7 @@ Il [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) supp
 - Imposta un numero di GPU da usare con l'argomento `nproc_per_node`.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/it/training.md b/docs/source/it/training.md
index be0883f07b7715..503a43321799e1 100644
--- a/docs/source/it/training.md
+++ b/docs/source/it/training.md
@@ -43,7 +43,7 @@ Inizia caricando il dataset [Yelp Reviews](https://huggingface.co/datasets/yelp_
  'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
 ```
 
-Come già sai, hai bisogno di un tokenizer per processare il testo e includere una strategia di padding e truncation per gestire sequenze di lunghezza variabile. Per processare il dataset in un unico passo, usa il metodo [`map`](https://huggingface.co/docs/datasets/process.html#map) di 🤗 Datasets che applica la funzione di preprocessing all'intero dataset:
+Come già sai, hai bisogno di un tokenizer per processare il testo e includere una strategia di padding e truncation per gestire sequenze di lunghezza variabile. Per processare il dataset in un unico passo, usa il metodo [`map`](https://huggingface.co/docs/datasets/process#map) di 🤗 Datasets che applica la funzione di preprocessing all'intero dataset:
 
 ```py
 >>> from transformers import AutoTokenizer
@@ -103,7 +103,7 @@ Specifica dove salvare i checkpoints del tuo addestramento:
 
 ### Metriche
 
-[`Trainer`] non valuta automaticamente le performance del modello durante l'addestramento. Dovrai passare a [`Trainer`] una funzione che calcola e restituisce le metriche. La libreria 🤗 Datasets mette a disposizione una semplice funzione [`accuracy`](https://huggingface.co/metrics/accuracy) che puoi caricare con la funzione `load_metric` (guarda questa [esercitazione](https://huggingface.co/docs/datasets/metrics.html) per maggiori informazioni):
+[`Trainer`] non valuta automaticamente le performance del modello durante l'addestramento. Dovrai passare a [`Trainer`] una funzione che calcola e restituisce le metriche. La libreria 🤗 Datasets mette a disposizione una semplice funzione [`accuracy`](https://huggingface.co/metrics/accuracy) che puoi caricare con la funzione `load_metric` (guarda questa [esercitazione](https://huggingface.co/docs/datasets/metrics) per maggiori informazioni):
 
 ```py
 >>> import numpy as np
@@ -346,7 +346,7 @@ Per tenere traccia dei tuoi progressi durante l'addestramento, usa la libreria [
 
 ### Metriche
 
-Proprio come è necessario aggiungere una funzione di valutazione del [`Trainer`], è necessario fare lo stesso quando si scrive il proprio ciclo di addestramento. Ma invece di calcolare e riportare la metrica alla fine di ogni epoca, questa volta accumulerai tutti i batch con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch) e calcolerai la metrica alla fine.
+Proprio come è necessario aggiungere una funzione di valutazione del [`Trainer`], è necessario fare lo stesso quando si scrive il proprio ciclo di addestramento. Ma invece di calcolare e riportare la metrica alla fine di ogni epoca, questa volta accumulerai tutti i batch con [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=add_batch#datasets.Metric.add_batch) e calcolerai la metrica alla fine.
 
 ```py
 >>> metric = load_metric("accuracy")
diff --git a/docs/source/ja/_toctree.yml b/docs/source/ja/_toctree.yml
index df686b475dab35..9766571c6b87eb 100644
--- a/docs/source/ja/_toctree.yml
+++ b/docs/source/ja/_toctree.yml
@@ -29,11 +29,76 @@
     title: LLM を使用した生成
   title: Tutorials
 - sections:
+  - isExpanded: false
+    sections:
+    - local: tasks/sequence_classification
+      title: テキストの分類
+    - local: tasks/token_classification
+      title: トークンの分類
+    - local: tasks/question_answering
+      title: 質疑応答
+    - local: tasks/language_modeling
+      title: 因果言語モデリング
+    - local: tasks/masked_language_modeling
+      title: マスクされた言語モデリング
+    - local: tasks/translation
+      title: 翻訳
+    - local: tasks/summarization
+      title: 要約
+    - local: tasks/multiple_choice
+      title: 複数の選択肢
+    title: 自然言語処理
+  - isExpanded: false
+    sections:
+    - local: tasks/audio_classification
+      title: 音声の分類
+    - local: tasks/asr
+      title: 自動音声認識
+    title: オーディオ
+  - isExpanded: false
+    sections:
+    - local: tasks/image_classification
+      title: 画像分類
+    - local: tasks/semantic_segmentation
+      title: セマンティックセグメンテーション
+    - local: tasks/video_classification
+      title: ビデオの分類
+    - local: tasks/object_detection
+      title: 物体検出
+    - local: tasks/zero_shot_object_detection
+      title: ゼロショット物体検出
+    - local: tasks/zero_shot_image_classification
+      title: ゼロショット画像分類
+    - local: tasks/monocular_depth_estimation
+      title: 深さの推定
+    - local: tasks/image_to_image
+      title: 画像から画像へ
+    - local: tasks/knowledge_distillation_for_image_classification
+      title: コンピュータビジョンのための知識の蒸留
+    title: コンピュータビジョン
+  - isExpanded: false
+    sections:
+    - local: tasks/image_captioning
+      title: 画像のキャプション
+    - local: tasks/document_question_answering
+      title: 文書の質問への回答
+    - local: tasks/visual_question_answering
+      title: 視覚的な質問への回答
+    - local: tasks/text-to-speech
+      title: テキスト読み上げ
+    title: マルチモーダル
   - isExpanded: false
     sections:
     - local: generation_strategies
       title: 生成戦略をカスタマイズする
-    title: Generation
+    title: 世代
+  - isExpanded: false
+    sections:
+    - local: tasks/idefics
+      title: IDEFICS を使用したイメージ タスク
+    - local: tasks/prompting
+      title: LLM プロンプト ガイド
+    title: プロンプト
   title: Task Guides
 - sections:
   - local: fast_tokenizers
@@ -185,11 +250,70 @@
       sections:
       - local: model_doc/albert
         title: ALBERT
+      - local: model_doc/bart
+        title: BART
+      - local: model_doc/barthez
+        title: BARThez
+      - local: model_doc/bartpho
+        title: BARTpho
+      - local: model_doc/bert
+        title: BERT
+      - local: model_doc/bert-generation
+        title: BertGeneration
+      - local: model_doc/bert-japanese
+        title: BertJapanese
+      - local: model_doc/bertweet
+        title: Bertweet
+      - local: model_doc/big_bird
+        title: BigBird
+      - local: model_doc/bigbird_pegasus
+        title: BigBirdPegasus
+      - local: model_doc/biogpt
+        title: BioGpt
+      - local: model_doc/blenderbot
+        title: Blenderbot
+      - local: model_doc/blenderbot-small
+        title: Blenderbot Small
+      - local: model_doc/bloom
+        title: BLOOM
+      - local: model_doc/bort
+        title: BORT
+      - local: model_doc/byt5
+        title: ByT5
+      - local: model_doc/camembert
+        title: CamemBERT
+      - local: model_doc/canine
+        title: CANINE
+      - local: model_doc/codegen
+        title: CodeGen
+      - local: model_doc/code_llama
+        title: CodeLlama
+      - local: model_doc/convbert
+        title: ConvBERT
+      - local: model_doc/cpm
+        title: CPM
       title: 文章モデル
+    - isExpanded: false
+      sections:
+      - local: model_doc/beit
+        title: BEiT
+      - local: model_doc/bit
+        title: BiT
+      - local: model_doc/conditional_detr
+        title: Conditional DETR
+      - local: model_doc/convnext
+        title: ConvNeXT
+      - local: model_doc/convnextv2
+        title: ConvNeXTV2
+      title: ビジョンモデル
     - isExpanded: false
       sections:
       - local: model_doc/audio-spectrogram-transformer
         title: Audio Spectrogram Transformer
+      - local: model_doc/bark
+        title: Bark
+      - local: model_doc/clap
+        title: CLAP
       title: 音声モデル
     - isExpanded: false
       sections:
@@ -197,6 +321,22 @@
         title: ALIGN
       - local: model_doc/altclip
         title: AltCLIP
+      - local: model_doc/blip
+        title: BLIP
+      - local: model_doc/blip-2
+        title: BLIP-2
+      - local: model_doc/bridgetower
+        title: BridgeTower
+      - local: model_doc/bros
+        title: BROS
+      - local: model_doc/chinese_clip
+        title: Chinese-CLIP
+      - local: model_doc/clip
+        title: CLIP
+      - local: model_doc/clipseg
+        title: CLIPSeg
+      - local: model_doc/clvp
+        title: CLVP
       title: マルチモーダルモデル
     - isExpanded: false
       sections:
diff --git a/docs/source/ja/main_classes/trainer.md b/docs/source/ja/main_classes/trainer.md
index 4c1ce95ca38aa6..f18c1b86809f99 100644
--- a/docs/source/ja/main_classes/trainer.md
+++ b/docs/source/ja/main_classes/trainer.md
@@ -196,7 +196,7 @@ _python_、_numpy_、および _pytorch_ の RNG 状態は、そのチェック
 [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.Parallel.DistributedDataParallel.html) を使用して GPU のサブセットのみを使用する場合、使用する GPU の数を指定するだけです。 。たとえば、GPU が 4 つあるが、最初の 2 つを使用したい場合は、次のようにします。
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node=2  trainer-program.py ...
+torchrun --nproc_per_node=2  trainer-program.py ...
 ```
 
 [`accelerate`](https://github.com/huggingface/accelerate) または [`deepspeed`](https://github.com/microsoft/DeepSpeed) がインストールされている場合は、次を使用して同じことを達成することもできます。の一つ：
@@ -209,7 +209,7 @@ accelerate launch --num_processes 2 trainer-program.py ...
 deepspeed --num_gpus 2 trainer-program.py ...
 ```
 
-これらのランチャーを使用するために、Accelerate または [Deepspeed 統合](Deepspeed) 機能を使用する必要はありません。
+これらのランチャーを使用するために、Accelerate または [Deepspeed 統合](deepspeed) 機能を使用する必要はありません。
 
 
 これまでは、プログラムに使用する GPU の数を指示できました。次に、特定の GPU を選択し、その順序を制御する方法について説明します。
@@ -223,7 +223,7 @@ deepspeed --num_gpus 2 trainer-program.py ...
 たとえば、4 つの GPU (0、1、2、3) があるとします。物理 GPU 0 と 2 のみで実行するには、次のようにします。
 
 ```bash
-CUDA_VISIBLE_DEVICES=0,2 python -m torch.distributed.launch trainer-program.py ...
+CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
 ```
 
 したがって、pytorch は 2 つの GPU のみを認識し、物理 GPU 0 と 2 はそれぞれ `cuda:0` と `cuda:1` にマッピングされます。
@@ -231,7 +231,7 @@ CUDA_VISIBLE_DEVICES=0,2 python -m torch.distributed.launch trainer-program.py .
 順序を変更することもできます。
 
 ```bash
-CUDA_VISIBLE_DEVICES=2,0 python -m torch.distributed.launch trainer-program.py ...
+CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
 ```
 
 ここでは、物理 GPU 0 と 2 がそれぞれ`cuda:1`と`cuda:0`にマッピングされています。
@@ -253,7 +253,7 @@ CUDA_VISIBLE_DEVICES= python trainer-program.py ...
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0,2
-python -m torch.distributed.launch trainer-program.py ...
+torchrun trainer-program.py ...
 ```
 
 ただし、この方法では、以前に環境変数を設定したことを忘れて、なぜ間違った GPU が使用されているのか理解できない可能性があるため、混乱を招く可能性があります。したがって、このセクションのほとんどの例で示されているように、同じコマンド ラインで特定の実行に対してのみ環境変数を設定するのが一般的です。
diff --git a/docs/source/ja/model_doc/align.md b/docs/source/ja/model_doc/align.md
index 6e62c3d9f4ca68..84496e605def92 100644
--- a/docs/source/ja/model_doc/align.md
+++ b/docs/source/ja/model_doc/align.md
@@ -51,10 +51,10 @@ inputs = processor(text=candidate_labels, images=image, return_tensors="pt")
 with torch.no_grad():
     outputs = model(**inputs)
 
-# これは画像-テキスト類似度スコア
+# this is the image-text similarity score
 logits_per_image = outputs.logits_per_image
 
-# Softmaxを取ることで各ラベルの確率を得られる
+# we can take the softmax to get the label probabilities
 probs = logits_per_image.softmax(dim=1)
 print(probs)
 ```
diff --git a/docs/source/ja/model_doc/altclip.md b/docs/source/ja/model_doc/altclip.md
index 232b3645544f39..87cf6cc17d6a0b 100644
--- a/docs/source/ja/model_doc/altclip.md
+++ b/docs/source/ja/model_doc/altclip.md
@@ -52,8 +52,8 @@ Transformerエンコーダーに画像を与えるには、各画像を固定サ
 >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
 
 >>> outputs = model(**inputs)
->>> logits_per_image = outputs.logits_per_image  # これは画像-テキスト類似度スコア
->>> probs = logits_per_image.softmax(dim=1)  # Softmaxを取ることで各ラベルの確率を得られる
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
 ```
 
 <Tip>
diff --git a/docs/source/ja/model_doc/audio-spectrogram-transformer.md b/docs/source/ja/model_doc/audio-spectrogram-transformer.md
index efbadbd4bae67b..a5107b14f83a98 100644
--- a/docs/source/ja/model_doc/audio-spectrogram-transformer.md
+++ b/docs/source/ja/model_doc/audio-spectrogram-transformer.md
@@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
 
 ## 概要
 
-Audio Spectrogram Transformerモデルは、「[AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778)」という論文でYuan Gong、Yu-An Chung、James Glassによって提案されました。これは、音声を画像（スペクトログラム）に変換することで、音声に[Vision Transformer](vit)を適用します。このモデルは音声分類において最先端の結果を得ています。
+Audio Spectrogram Transformerモデルは、[AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778)という論文でYuan Gong、Yu-An Chung、James Glassによって提案されました。これは、音声を画像（スペクトログラム）に変換することで、音声に[Vision Transformer](vit)を適用します。このモデルは音声分類において最先端の結果を得ています。
 
 論文の要旨は以下の通りです：
 
@@ -35,7 +35,7 @@ alt="drawing" width="600"/>
 ## 使用上のヒント
 
 - 独自のデータセットでAudio Spectrogram Transformer（AST）をファインチューニングする場合、入力の正規化（入力の平均を0、標準偏差を0.5にすること）処理することが推奨されます。[`ASTFeatureExtractor`]はこれを処理します。デフォルトではAudioSetの平均と標準偏差を使用していることに注意してください。著者が下流のデータセットの統計をどのように計算しているかは、[`ast/src/get_norm_stats.py`](https://github.com/YuanGongND/ast/blob/master/src/get_norm_stats.py)で確認することができます。
-- ASTは低い学習率が必要であり（著者は[PSLA論文](https://arxiv.org/abs/2102.01243)で提案されたCNNモデルに比べて10倍小さい学習率を使用しています）、素早く収束するため、タスクに適した学習率と学習率スケジューラーを探すことをお勧めします。
+- ASTは低い学習率が必要であり 著者は[PSLA論文](https://arxiv.org/abs/2102.01243)で提案されたCNNモデルに比べて10倍小さい学習率を使用しています）、素早く収束するため、タスクに適した学習率と学習率スケジューラーを探すことをお勧めします。
 
 ## 参考資料
 
diff --git a/docs/source/ja/model_doc/auto.md b/docs/source/ja/model_doc/auto.md
index b104e4c99fe3a8..c6775493baae39 100644
--- a/docs/source/ja/model_doc/auto.md
+++ b/docs/source/ja/model_doc/auto.md
@@ -43,7 +43,7 @@ AutoModel.register(NewModelConfig, NewModel)
 
 <Tip warning={true}>
 
-あなたの`NewModelConfig`が[`~transformer.PretrainedConfig`]のサブクラスである場合、その`model_type`属性がコンフィグを登録するときに使用するキー（ここでは`"new-model"`）と同じに設定されていることを確認してください。
+あなたの`NewModelConfig`が[`~transformers.PretrainedConfig`]のサブクラスである場合、その`model_type`属性がコンフィグを登録するときに使用するキー（ここでは`"new-model"`）と同じに設定されていることを確認してください。
 
 同様に、あなたの`NewModel`が[`PreTrainedModel`]のサブクラスである場合、その`config_class`属性がモデルを登録する際に使用するクラス（ここでは`NewModelConfig`）と同じに設定されていることを確認してください。
 
diff --git a/docs/source/ja/model_doc/bark.md b/docs/source/ja/model_doc/bark.md
new file mode 100644
index 00000000000000..508b8938889bae
--- /dev/null
+++ b/docs/source/ja/model_doc/bark.md
@@ -0,0 +1,198 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Bark
+
+## Overview
+
+Bark は、[suno-ai/bark](https://github.com/suno-ai/bark) で Suno AI によって提案されたトランスフォーマーベースのテキスト読み上げモデルです。
+
+
+Bark は 4 つの主要なモデルで構成されています。
+
+- [`BarkSemanticModel`] ('テキスト'モデルとも呼ばれる): トークン化されたテキストを入力として受け取り、テキストの意味を捉えるセマンティック テキスト トークンを予測する因果的自己回帰変換モデル。
+- [`BarkCoarseModel`] ('粗い音響' モデルとも呼ばれる): [`BarkSemanticModel`] モデルの結果を入力として受け取る因果的自己回帰変換器。 EnCodec に必要な最初の 2 つのオーディオ コードブックを予測することを目的としています。
+- [`BarkFineModel`] ('微細音響' モデル)、今回は非因果的オートエンコーダー トランスフォーマーで、以前のコードブック埋め込みの合計に基づいて最後のコードブックを繰り返し予測します。
+- [`EncodecModel`] からすべてのコードブック チャネルを予測したので、Bark はそれを使用して出力オーディオ配列をデコードします。
+
+最初の 3 つのモジュールはそれぞれ、特定の事前定義された音声に従って出力サウンドを調整するための条件付きスピーカー埋め込みをサポートできることに注意してください。
+
+### Optimizing Bark
+
+Bark は、コードを数行追加するだけで最適化でき、**メモリ フットプリントが大幅に削減**され、**推論が高速化**されます。
+
+#### Using half-precision
+
+モデルを半精度でロードするだけで、推論を高速化し、メモリ使用量を 50% 削減できます。
+
+```python
+from transformers import BarkModel
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
+```
+
+#### Using 🤗 Better Transformer
+
+Better Transformer は、内部でカーネル融合を実行する 🤗 最適な機能です。パフォーマンスを低下させることなく、速度を 20% ～ 30% 向上させることができます。モデルを 🤗 Better Transformer にエクスポートするのに必要なコードは 1 行だけです。
+
+```python
+model =  model.to_bettertransformer()
+```
+
+この機能を使用する前に 🤗 Optimum をインストールする必要があることに注意してください。 [インストール方法はこちら](https://huggingface.co/docs/optimum/installation)
+
+#### Using CPU offload
+
+前述したように、Bark は 4 つのサブモデルで構成されており、オーディオ生成中に順番に呼び出されます。言い換えれば、1 つのサブモデルが使用されている間、他のサブモデルはアイドル状態になります。
+
+CUDA デバイスを使用している場合、メモリ フットプリントの 80% 削減による恩恵を受ける簡単な解決策は、アイドル状態の GPU のサブモデルをオフロードすることです。この操作は CPU オフロードと呼ばれます。 1行のコードで使用できます。
+
+```python
+model.enable_cpu_offload()
+```
+
+この機能を使用する前に、🤗 Accelerate をインストールする必要があることに注意してください。 [インストール方法はこちら](https://huggingface.co/docs/accelerate/basic_tutorials/install)
+
+#### Combining optimization techniques
+
+最適化手法を組み合わせて、CPU オフロード、半精度、🤗 Better Transformer をすべて一度に使用できます。
+
+```python
+from transformers import BarkModel
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+# load in fp16
+model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16).to(device)
+
+# convert to bettertransformer
+model = BetterTransformer.transform(model, keep_original_model=False)
+
+# enable CPU offload
+model.enable_cpu_offload()
+```
+
+推論最適化手法の詳細については、[こちら](https://huggingface.co/docs/transformers/perf_infer_gpu_one) をご覧ください。
+
+### Tips
+
+Suno は、多くの言語で音声プリセットのライブラリを提供しています [こちら](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c)。
+これらのプリセットは、ハブ [こちら](https://huggingface.co/suno/bark-small/tree/main/speaker_embeddings) または [こちら](https://huggingface.co/suno/bark/tree/main/speaker_embeddings)。
+
+```python
+>>> from transformers import AutoProcessor, BarkModel
+
+>>> processor = AutoProcessor.from_pretrained("suno/bark")
+>>> model = BarkModel.from_pretrained("suno/bark")
+
+>>> voice_preset = "v2/en_speaker_6"
+
+>>> inputs = processor("Hello, my dog is cute", voice_preset=voice_preset)
+
+>>> audio_array = model.generate(**inputs)
+>>> audio_array = audio_array.cpu().numpy().squeeze()
+```
+
+Bark は、非常にリアルな **多言語** 音声だけでなく、音楽、背景ノイズ、単純な効果音などの他の音声も生成できます。
+
+```python
+>>> # Multilingual speech - simplified Chinese
+>>> inputs = processor("惊人的！我会说中文")
+
+>>> # Multilingual speech - French - let's use a voice_preset as well
+>>> inputs = processor("Incroyable! Je peux générer du son.", voice_preset="fr_speaker_5")
+
+>>> # Bark can also generate music. You can help it out by adding music notes around your lyrics.
+>>> inputs = processor("♪ Hello, my dog is cute ♪")
+
+>>> audio_array = model.generate(**inputs)
+>>> audio_array = audio_array.cpu().numpy().squeeze()
+```
+
+このモデルは、笑う、ため息、泣くなどの**非言語コミュニケーション**を生成することもできます。
+
+
+```python
+>>> # Adding non-speech cues to the input text
+>>> inputs = processor("Hello uh ... [clears throat], my dog is cute [laughter]")
+
+>>> audio_array = model.generate(**inputs)
+>>> audio_array = audio_array.cpu().numpy().squeeze()
+```
+
+オーディオを保存するには、モデル設定と scipy ユーティリティからサンプル レートを取得するだけです。
+
+```python
+>>> from scipy.io.wavfile import write as write_wav
+
+>>> # save audio to disk, but first take the sample rate from the model config
+>>> sample_rate = model.generation_config.sample_rate
+>>> write_wav("bark_generation.wav", sample_rate, audio_array)
+```
+
+このモデルは、[Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe) および [Sanchit Gandhi (sanchit-gandhi)](https://github.com/sanchit-gandhi) によって提供されました。
+元のコードは [ここ](https://github.com/suno-ai/bark) にあります。
+
+## BarkConfig
+
+[[autodoc]] BarkConfig
+    - all
+
+## BarkProcessor
+
+[[autodoc]] BarkProcessor
+    - all
+    - __call__
+
+## BarkModel
+
+[[autodoc]] BarkModel
+    - generate
+    - enable_cpu_offload
+
+## BarkSemanticModel
+
+[[autodoc]] BarkSemanticModel
+    - forward
+
+## BarkCoarseModel
+
+[[autodoc]] BarkCoarseModel
+    - forward
+
+## BarkFineModel
+
+[[autodoc]] BarkFineModel
+    - forward
+
+## BarkCausalModel
+
+[[autodoc]] BarkCausalModel
+    - forward
+
+## BarkCoarseConfig
+
+[[autodoc]] BarkCoarseConfig
+    - all
+
+## BarkFineConfig
+
+[[autodoc]] BarkFineConfig
+    - all
+
+## BarkSemanticConfig
+
+[[autodoc]] BarkSemanticConfig
+    - all
diff --git a/docs/source/ja/model_doc/bart.md b/docs/source/ja/model_doc/bart.md
new file mode 100644
index 00000000000000..0f2c3fb006e137
--- /dev/null
+++ b/docs/source/ja/model_doc/bart.md
@@ -0,0 +1,223 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BART
+
+<div class="flex flex-wrap space-x-1">
+<a href="https://huggingface.co/models?filter=bart">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-bart-blueviolet">
+</a>
+<a href="https://huggingface.co/spaces/docs-demos/bart-large-mnli">
+<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
+</a>
+</div>
+
+**免責事項:** 何か奇妙なものを見つけた場合は、[Github 問題](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) を提出し、割り当ててください。
+@patrickvonplaten
+
+## Overview
+
+Bart モデルは、[BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation、
+翻訳と理解](https://arxiv.org/abs/1910.13461) Mike Lewis、Yinhan Liu、Naman Goyal、Marjan 著
+ガズビニネジャド、アブデルラフマン・モハメド、オメル・レヴィ、ベス・ストヤノフ、ルーク・ゼトルモイヤー、2019年10月29日。
+
+要約によると、
+
+- Bart は、双方向エンコーダ (BERT など) を備えた標準の seq2seq/機械翻訳アーキテクチャを使用します。
+  左から右へのデコーダ (GPT など)。
+- 事前トレーニング タスクには、元の文の順序をランダムにシャッフルし、新しい埋め込みスキームが含まれます。
+  ここで、テキストの範囲は単一のマスク トークンに置き換えられます。
+- BART は、テキスト生成用に微調整した場合に特に効果的ですが、理解タスクにも適しています。それ
+  RoBERTa のパフォーマンスを GLUE および SQuAD の同等のトレーニング リソースと同等にし、新たな成果を達成します。
+  さまざまな抽象的な対話、質問応答、要約タスクに関する最先端の結果が得られ、成果が得られます。
+  ルージュは最大6枚まで。
+
+チップ：
+
+- BART は絶対位置埋め込みを備えたモデルであるため、通常は入力を右側にパディングすることをお勧めします。
+  左。
+- エンコーダーとデコーダーを備えたシーケンスツーシーケンス モデル。エンコーダには破損したバージョンのトークンが供給され、デコーダには元のトークンが供給されます（ただし、通常のトランスフォーマー デコーダと同様に、将来のワードを隠すためのマスクがあります）。次の変換の構成は、エンコーダーの事前トレーニング タスクに適用されます。
+
+  * ランダムなトークンをマスクします (BERT と同様)
+  * ランダムなトークンを削除します
+  * k 個のトークンのスパンを 1 つのマスク トークンでマスクします (0 トークンのスパンはマスク トークンの挿入です)
+  * 文を並べ替えます
+  * ドキュメントを回転して特定のトークンから開始するようにします
+
+このモデルは [sshleifer](https://huggingface.co/sshleifer) によって提供されました。著者のコードは [ここ](https://github.com/pytorch/fairseq/tree/master/examples/bart) にあります。
+
+### Examples
+
+- シーケンス間タスク用の BART およびその他のモデルを微調整するための例とスクリプトは、次の場所にあります。
+  [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md)。
+- Hugging Face `datasets` を使用して [`BartForConditionalGeneration`] をトレーニングする方法の例
+  オブジェクトは、この [フォーラム ディスカッション](https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904) で見つけることができます。
+- [抽出されたチェックポイント](https://huggingface.co/models?search=distilbart) は、この [論文](https://arxiv.org/abs/2010.13002) で説明されています。
+
+## Implementation Notes
+
+- Bart はシーケンスの分類に `token_type_ids` を使用しません。 [`BartTokenizer`] を使用するか、
+  [`~BartTokenizer.encode`] を使用して適切に分割します。
+- [`BartModel`] のフォワードパスは、渡されなかった場合、`decoder_input_ids` を作成します。
+  これは、他のモデリング API とは異なります。この機能の一般的な使用例は、マスクの塗りつぶしです。
+- モデルの予測は、次の場合に元の実装と同一になるように意図されています。
+  `forced_bos_token_id=0`。ただし、これは、渡す文字列が次の場合にのみ機能します。
+  [`fairseq.encode`] はスペースで始まります。
+- [`~generation.GenerationMixin.generate`] は、次のような条件付き生成タスクに使用する必要があります。
+  要約については、その docstring の例を参照してください。
+- *facebook/bart-large-cnn* 重みをロードするモデルには `mask_token_id` がないか、実行できません。
+  マスクを埋めるタスク。
+
+## Mask Filling
+
+`facebook/bart-base` および `facebook/bart-large` チェックポイントを使用して、マルチトークン マスクを埋めることができます。
+
+```python
+from transformers import BartForConditionalGeneration, BartTokenizer
+
+model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
+tok = BartTokenizer.from_pretrained("facebook/bart-large")
+example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
+batch = tok(example_english_phrase, return_tensors="pt")
+generated_ids = model.generate(batch["input_ids"])
+assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [
+    "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria"
+]
+```
+
+## Resources
+
+BART を始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+<PipelineTag pipeline="summarization"/>
+
+- に関するブログ投稿 [分散トレーニング: 🤗 Transformers と Amazon SageMaker を使用した要約のための BART/T5 のトレーニング](https://huggingface.co/blog/sagemaker-distributed-training-seq2seq)。
+- 方法に関するノートブック [blurr を使用して fastai で要約するために BART を微調整する](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb). 🌎 🌎
+- 方法に関するノートブック [トレーナー クラスを使用して 2 つの言語で要約するために BART を微調整する](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)。 🌎
+- [`BartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)。
+- [`TFBartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/summarization) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)。
+- [`FlaxBartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/flax/summarization) でサポートされています。
+- [要約](https://huggingface.co/course/chapter7/5?fw=pt#summarization) 🤗 ハグフェイスコースの章。
+- [要約タスクガイド](../tasks/summarization.md)
+
+<PipelineTag pipeline="fill-mask"/>
+
+- [`BartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) でサポートされており、 [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)。
+- [`TFBartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)。
+- [`FlaxBartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) および [ノートブック]( https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb)。
+- [マスクされた言語モデリング](https://huggingface.co/course/chapter7/3?fw=pt) 🤗 顔ハグ コースの章。
+- [マスクされた言語モデリング タスク ガイド](../tasks/masked_lang_modeling)
+
+<PipelineTag pipeline="translation"/>
+
+- [ヒンディー語から英語への翻訳に Seq2SeqTrainer を使用して mBART を微調整する]方法に関するノート (https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)。 🌎
+- [`BartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/translation) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)。
+- [`TFBartForConditionalGeneration`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/translation) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)。
+- [翻訳タスクガイド](../tasks/translation)
+
+以下も参照してください。
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [抽出されたチェックポイント](https://huggingface.co/models?search=distilbart) は、この [論文](https://arxiv.org/abs/2010.13002) で説明されています。
+
+## BartConfig
+
+[[autodoc]] BartConfig
+    - all
+
+## BartTokenizer
+
+[[autodoc]] BartTokenizer
+    - all
+
+## BartTokenizerFast
+
+[[autodoc]] BartTokenizerFast
+    - all
+
+## BartModel
+
+[[autodoc]] BartModel
+    - forward
+
+## BartForConditionalGeneration
+
+[[autodoc]] BartForConditionalGeneration
+    - forward
+
+## BartForSequenceClassification
+
+[[autodoc]] BartForSequenceClassification
+    - forward
+
+## BartForQuestionAnswering
+
+[[autodoc]] BartForQuestionAnswering
+    - forward
+
+## BartForCausalLM
+
+[[autodoc]] BartForCausalLM
+    - forward
+
+## TFBartModel
+
+[[autodoc]] TFBartModel
+    - call
+
+## TFBartForConditionalGeneration
+
+[[autodoc]] TFBartForConditionalGeneration
+    - call
+
+## TFBartForSequenceClassification
+
+[[autodoc]] TFBartForSequenceClassification
+    - call
+
+## FlaxBartModel
+
+[[autodoc]] FlaxBartModel
+    - __call__
+    - encode
+    - decode
+
+## FlaxBartForConditionalGeneration
+
+[[autodoc]] FlaxBartForConditionalGeneration
+    - __call__
+    - encode
+    - decode
+
+## FlaxBartForSequenceClassification
+
+[[autodoc]] FlaxBartForSequenceClassification
+    - __call__
+    - encode
+    - decode
+
+## FlaxBartForQuestionAnswering
+
+[[autodoc]] FlaxBartForQuestionAnswering
+    - __call__
+    - encode
+    - decode
+
+## FlaxBartForCausalLM
+
+[[autodoc]] FlaxBartForCausalLM
+    - __call__
diff --git a/docs/source/ja/model_doc/barthez.md b/docs/source/ja/model_doc/barthez.md
new file mode 100644
index 00000000000000..94844c3f675d8a
--- /dev/null
+++ b/docs/source/ja/model_doc/barthez.md
@@ -0,0 +1,60 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BARThez
+
+## Overview
+
+BARThez モデルは、Moussa Kamal Eddine、Antoine J.-P によって [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) で提案されました。ティクシエ、ミカリス・ヴァジルジャンニス、10月23日、
+2020年。
+
+論文の要約:
+
+
+*帰納的転移学習は、自己教師あり学習によって可能になり、自然言語処理全体を実行します。
+(NLP) 分野は、BERT や BART などのモデルにより、無数の自然言語に新たな最先端技術を確立し、嵐を巻き起こしています。
+タスクを理解すること。いくつかの注目すべき例外はありますが、利用可能なモデルと研究のほとんどは、
+英語を対象に実施されました。この作品では、フランス語用の最初の BART モデルである BARTez を紹介します。
+（我々の知る限りに）。 BARThez は、過去の研究から得た非常に大規模な単一言語フランス語コーパスで事前トレーニングされました
+BART の摂動スキームに合わせて調整しました。既存の BERT ベースのフランス語モデルとは異なり、
+CamemBERT と FlauBERT、BARThez は、エンコーダだけでなく、
+そのデコーダは事前トレーニングされています。 FLUE ベンチマークからの識別タスクに加えて、BARThez を新しい評価に基づいて評価します。
+この論文とともにリリースする要約データセット、OrangeSum。また、すでに行われている事前トレーニングも継続します。
+BARTHez のコーパス上で多言語 BART を事前訓練し、結果として得られるモデル (mBARTHez と呼ぶ) が次のことを示します。
+バニラの BARThez を大幅に強化し、CamemBERT や FlauBERT と同等かそれを上回ります。*
+
+このモデルは [moussakam](https://huggingface.co/moussakam) によって寄稿されました。著者のコードは[ここ](https://github.com/moussaKam/BARThez)にあります。
+
+<Tip>
+
+BARThez の実装は、トークン化を除いて BART と同じです。詳細については、[BART ドキュメント](bart) を参照してください。
+構成クラスとそのパラメータ。 BARThez 固有のトークナイザーについては以下に記載されています。
+
+</Tip>
+
+### Resources
+
+- BARThez は、BART と同様の方法でシーケンス間のタスクを微調整できます。以下を確認してください。
+  [examples/pytorch/summarization/](https://github.com/huggingface/transformers/tree/main/examples/pytorch/summarization/README.md)。
+
+
+## BarthezTokenizer
+
+[[autodoc]] BarthezTokenizer
+
+## BarthezTokenizerFast
+
+[[autodoc]] BarthezTokenizerFast
diff --git a/docs/source/ja/model_doc/bartpho.md b/docs/source/ja/model_doc/bartpho.md
new file mode 100644
index 00000000000000..a9575d821ef916
--- /dev/null
+++ b/docs/source/ja/model_doc/bartpho.md
@@ -0,0 +1,86 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BARTpho
+
+## Overview
+
+BARTpho モデルは、Nguyen Luong Tran、Duong Minh Le、Dat Quoc Nguyen によって [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnam](https://arxiv.org/abs/2109.09701) で提案されました。
+
+論文の要約は次のとおりです。
+
+*BARTpho には、BARTpho_word と BARTpho_syllable の 2 つのバージョンがあり、初の公開された大規模な単一言語です。
+ベトナム語用に事前トレーニングされたシーケンスツーシーケンス モデル。当社の BARTpho は「大規模な」アーキテクチャと事前トレーニングを使用します
+シーケンス間ノイズ除去モデル BART のスキームなので、生成 NLP タスクに特に適しています。実験
+ベトナム語テキスト要約の下流タスクでは、自動評価と人間による評価の両方で、BARTpho が
+強力なベースライン mBART を上回り、最先端の性能を向上させます。将来を容易にするためにBARTphoをリリースします
+生成的なベトナム語 NLP タスクの研究と応用。*
+
+このモデルは [dqnguyen](https://huggingface.co/dqnguyen) によって提供されました。元のコードは [こちら](https://github.com/VinAIResearch/BARTpho) にあります。
+
+## Usage example
+
+```python
+>>> import torch
+>>> from transformers import AutoModel, AutoTokenizer
+
+>>> bartpho = AutoModel.from_pretrained("vinai/bartpho-syllable")
+
+>>> tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
+
+>>> line = "Chúng tôi là những nghiên cứu viên."
+
+>>> input_ids = tokenizer(line, return_tensors="pt")
+
+>>> with torch.no_grad():
+...     features = bartpho(**input_ids)  # Models outputs are now tuples
+
+>>> # With TensorFlow 2.0+:
+>>> from transformers import TFAutoModel
+
+>>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable")
+>>> input_ids = tokenizer(line, return_tensors="tf")
+>>> features = bartpho(**input_ids)
+```
+
+## Usage tips
+
+- mBARTに続いて、BARTphoはBARTの「大規模な」アーキテクチャを使用し、その上に追加の層正規化層を備えています。
+  エンコーダとデコーダの両方。したがって、[BART のドキュメント](bart) の使用例は、使用に適応する場合に使用されます。
+  BARTpho を使用する場合は、BART に特化したクラスを mBART に特化した対応するクラスに置き換えることによって調整する必要があります。
+  例えば：
+
+```python
+>>> from transformers import MBartForConditionalGeneration
+
+>>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable")
+>>> TXT = "Chúng tôi là <mask> nghiên cứu viên."
+>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
+>>> logits = bartpho(input_ids).logits
+>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+>>> probs = logits[0, masked_index].softmax(dim=0)
+>>> values, predictions = probs.topk(5)
+>>> print(tokenizer.decode(predictions).split())
+```
+
+- この実装はトークン化のみを目的としています。`monolingual_vocab_file`はベトナム語に特化した型で構成されています
+  多言語 XLM-RoBERTa から利用できる事前トレーニング済み SentencePiece モデル`vocab_file`から抽出されます。
+  他の言語 (サブワードにこの事前トレーニング済み多言語 SentencePiece モデル`vocab_file`を使用する場合)
+  セグメンテーションにより、独自の言語に特化した`monolingual_vocab_file`を使用して BartphoTokenizer を再利用できます。
+
+## BartphoTokenizer
+
+[[autodoc]] BartphoTokenizer
diff --git a/docs/source/ja/model_doc/beit.md b/docs/source/ja/model_doc/beit.md
new file mode 100644
index 00000000000000..45eb1efa5dd873
--- /dev/null
+++ b/docs/source/ja/model_doc/beit.md
@@ -0,0 +1,143 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BEiT
+
+## Overview
+
+BEiT モデルは、[BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) で提案されました。
+ハンボ・バオ、リー・ドン、フル・ウェイ。 BERT に触発された BEiT は、自己教師ありの事前トレーニングを作成した最初の論文です。
+ビジョン トランスフォーマー (ViT) は、教師付き事前トレーニングよりも優れたパフォーマンスを発揮します。クラスを予測するためにモデルを事前トレーニングするのではなく
+([オリジナルの ViT 論文](https://arxiv.org/abs/2010.11929) で行われたように) 画像の BEiT モデルは、次のように事前トレーニングされています。
+マスクされた OpenAI の [DALL-E モデル](https://arxiv.org/abs/2102.12092) のコードブックからビジュアル トークンを予測します
+パッチ。
+
+論文の要約は次のとおりです。
+
+*自己教師あり視覚表現モデル BEiT (Bidirectional Encoderpresentation) を導入します。
+イメージトランスフォーマーより。自然言語処理分野で開発されたBERTに倣い、マスク画像を提案します。
+ビジョントランスフォーマーを事前にトレーニングするためのモデリングタスク。具体的には、事前トレーニングでは各画像に 2 つのビューがあります。
+パッチ (16x16 ピクセルなど)、およびビジュアル トークン (つまり、個別のトークン)。まず、元の画像を「トークン化」して、
+ビジュアルトークン。次に、いくつかの画像パッチをランダムにマスクし、それらをバックボーンの Transformer に供給します。事前トレーニング
+目的は、破損したイメージ パッチに基づいて元のビジュアル トークンを回復することです。 BEiTの事前トレーニング後、
+事前トレーニングされたエンコーダーにタスク レイヤーを追加することで、ダウンストリーム タスクのモデル パラメーターを直接微調整します。
+画像分類とセマンティックセグメンテーションに関する実験結果は、私たちのモデルが競争力のある結果を達成することを示しています
+以前の事前トレーニング方法を使用して。たとえば、基本サイズの BEiT は、ImageNet-1K で 83.2% のトップ 1 精度を達成します。
+同じ設定でゼロからの DeiT トレーニング (81.8%) を大幅に上回りました。また、大型BEiTは
+86.3% は ImageNet-1K のみを使用しており、ImageNet-22K での教師付き事前トレーニングを使用した ViT-L (85.2%) を上回っています。*
+
+## Usage tips
+
+- BEiT モデルは通常のビジョン トランスフォーマーですが、教師ありではなく自己教師ありの方法で事前トレーニングされています。彼らは
+  ImageNet-1K および CIFAR-100 で微調整すると、[オリジナル モデル (ViT)](vit) と [データ効率の高いイメージ トランスフォーマー (DeiT)](deit) の両方を上回るパフォーマンスを発揮します。推論に関するデモノートブックもチェックできます。
+  カスタム データの微調整は [こちら](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/VisionTransformer) (置き換えるだけで済みます)
+  [`BeitImageProcessor`] による [`ViTFeatureExtractor`] と
+  [`ViTForImageClassification`] by [`BeitForImageClassification`])。
+- DALL-E の画像トークナイザーと BEiT を組み合わせる方法を紹介するデモ ノートブックも利用可能です。
+  マスクされた画像モデリングを実行します。 [ここ](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/BEiT) で見つけることができます。
+- BEiT モデルは各画像が同じサイズ (解像度) であることを期待しているため、次のように使用できます。
+  [`BeitImageProcessor`] を使用して、モデルの画像のサイズを変更 (または再スケール) し、正規化します。
+- 事前トレーニングまたは微調整中に使用されるパッチ解像度と画像解像度の両方が名前に反映されます。
+  各チェックポイント。たとえば、`microsoft/beit-base-patch16-224`は、パッチ付きの基本サイズのアーキテクチャを指します。
+  解像度は 16x16、微調整解像度は 224x224 です。すべてのチェックポイントは [ハブ](https://huggingface.co/models?search=microsoft/beit) で見つけることができます。
+- 利用可能なチェックポイントは、(1) [ImageNet-22k](http://www.image-net.org/) で事前トレーニングされています (
+  1,400 万の画像と 22,000 のクラス) のみ、(2) ImageNet-22k でも微調整、または (3) [ImageNet-1k](http://www.image-net.org/challenges/LSVRC)でも微調整/2012/) (ILSVRC 2012 とも呼ばれ、130 万件のコレクション)
+  画像と 1,000 クラス)。
+- BEiT は、T5 モデルからインスピレーションを得た相対位置埋め込みを使用します。事前トレーニング中に、著者は次のことを共有しました。
+  いくつかの自己注意層間の相対的な位置の偏り。微調整中、各レイヤーの相対位置
+  バイアスは、事前トレーニング後に取得された共有相対位置バイアスで初期化されます。ご希望の場合は、
+  モデルを最初から事前トレーニングするには、`use_relative_position_bias` または
+  追加するには、[`BeitConfig`] の `use_relative_position_bias` 属性を `True` に設定します。
+  位置の埋め込み。
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/beit_architecture.jpg"
+alt="drawing" width="600"/>
+
+<small> BEiT の事前トレーニング。 <a href="https://arxiv.org/abs/2106.08254">元の論文から抜粋。</a> </small>
+
+このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。このモデルの JAX/FLAX バージョンは、
+[kamalkraj](https://huggingface.co/kamalkraj) による投稿。元のコードは [ここ](https://github.com/microsoft/unilm/tree/master/beit) にあります。
+
+## Resources
+
+BEiT の使用を開始するのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。
+
+<PipelineTag pipeline="image-classification"/>
+
+- [`BeitForImageClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)。
+- 参照: [画像分類タスク ガイド](../tasks/image_classification)
+
+**セマンティック セグメンテーション**
+- [セマンティック セグメンテーション タスク ガイド](../tasks/semantic_segmentation)
+
+ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+## BEiT specific outputs
+
+[[autodoc]] models.beit.modeling_beit.BeitModelOutputWithPooling
+
+[[autodoc]] models.beit.modeling_flax_beit.FlaxBeitModelOutputWithPooling
+
+## BeitConfig
+
+[[autodoc]] BeitConfig
+
+## BeitFeatureExtractor
+
+[[autodoc]] BeitFeatureExtractor
+    - __call__
+    - post_process_semantic_segmentation
+
+## BeitImageProcessor
+
+[[autodoc]] BeitImageProcessor
+    - preprocess
+    - post_process_semantic_segmentation
+
+## BeitModel
+
+[[autodoc]] BeitModel
+    - forward
+
+## BeitForMaskedImageModeling
+
+[[autodoc]] BeitForMaskedImageModeling
+    - forward
+
+## BeitForImageClassification
+
+[[autodoc]] BeitForImageClassification
+    - forward
+
+## BeitForSemanticSegmentation
+
+[[autodoc]] BeitForSemanticSegmentation
+    - forward
+
+## FlaxBeitModel
+
+[[autodoc]] FlaxBeitModel
+    - __call__
+
+## FlaxBeitForMaskedImageModeling
+
+[[autodoc]] FlaxBeitForMaskedImageModeling
+    - __call__
+
+## FlaxBeitForImageClassification
+
+[[autodoc]] FlaxBeitForImageClassification
+    - __call__
diff --git a/docs/source/ja/model_doc/bert-generation.md b/docs/source/ja/model_doc/bert-generation.md
new file mode 100644
index 00000000000000..4a25ff5d9bc662
--- /dev/null
+++ b/docs/source/ja/model_doc/bert-generation.md
@@ -0,0 +1,107 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BertGeneration
+
+## Overview
+
+BertGeneration モデルは、次を使用してシーケンス間のタスクに利用できる BERT モデルです。
+[Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) で提案されている [`EncoderDecoderModel`]
+タスク、Sascha Rothe、Sishi Nagayan、Aliaksei Severyn 著。
+
+論文の要約は次のとおりです。
+
+*大規模なニューラル モデルの教師なし事前トレーニングは、最近、自然言語処理に革命をもたらしました。による
+NLP 実践者は、公開されたチェックポイントからウォームスタートして、複数の項目で最先端の技術を推進してきました。
+コンピューティング時間を大幅に節約しながらベンチマークを実行します。これまでのところ、主に自然言語に焦点を当ててきました。
+タスクを理解する。この論文では、シーケンス生成のための事前トレーニングされたチェックポイントの有効性を実証します。私たちは
+公開されている事前トレーニング済み BERT と互換性のある Transformer ベースのシーケンス間モデルを開発しました。
+GPT-2 および RoBERTa チェックポイントを使用し、モデルの初期化の有用性について広範な実証研究を実施しました。
+エンコーダとデコーダ、これらのチェックポイント。私たちのモデルは、機械翻訳に関する新しい最先端の結果をもたらします。
+テキストの要約、文の分割、および文の融合。*
+
+## Usage examples and tips
+
+- モデルを [`EncoderDecoderModel`] と組み合わせて使用​​して、2 つの事前トレーニングされたモデルを活用できます。
+  後続の微調整のための BERT チェックポイント。
+
+```python
+>>> # leverage checkpoints for Bert2Bert model...
+>>> # use BERT's cls token as BOS token and sep token as EOS token
+>>> encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
+>>> # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
+>>> decoder = BertGenerationDecoder.from_pretrained(
+...     "bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102
+... )
+>>> bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+
+>>> # create tokenizer...
+>>> tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+
+>>> input_ids = tokenizer(
+...     "This is a long article to summarize", add_special_tokens=False, return_tensors="pt"
+... ).input_ids
+>>> labels = tokenizer("This is a short summary", return_tensors="pt").input_ids
+
+>>> # train...
+>>> loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
+>>> loss.backward()
+```
+
+- 事前トレーニングされた [`EncoderDecoderModel`] もモデル ハブで直接利用できます。
+
+```python
+>>> # instantiate sentence fusion model
+>>> sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
+>>> tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
+
+>>> input_ids = tokenizer(
+...     "This is the first sentence. This is the second sentence.", add_special_tokens=False, return_tensors="pt"
+... ).input_ids
+
+>>> outputs = sentence_fuser.generate(input_ids)
+
+>>> print(tokenizer.decode(outputs[0]))
+```
+
+チップ：
+
+- [`BertGenerationEncoder`] と [`BertGenerationDecoder`] は、
+  [`EncoderDecoder`] と組み合わせます。
+- 要約、文の分割、文の融合、および翻訳の場合、入力に特別なトークンは必要ありません。
+  したがって、入力の末尾に EOS トークンを追加しないでください。
+
+このモデルは、[patrickvonplaten](https://huggingface.co/patrickvonplaten) によって提供されました。元のコードは次のとおりです
+[ここ](https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder) があります。
+
+## BertGenerationConfig
+
+[[autodoc]] BertGenerationConfig
+
+## BertGenerationTokenizer
+
+[[autodoc]] BertGenerationTokenizer
+    - save_vocabulary
+
+## BertGenerationEncoder
+
+[[autodoc]] BertGenerationEncoder
+    - forward
+
+## BertGenerationDecoder
+
+[[autodoc]] BertGenerationDecoder
+    - forward
diff --git a/docs/source/ja/model_doc/bert-japanese.md b/docs/source/ja/model_doc/bert-japanese.md
new file mode 100644
index 00000000000000..86cce741aac6cb
--- /dev/null
+++ b/docs/source/ja/model_doc/bert-japanese.md
@@ -0,0 +1,81 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BertJapanese
+
+## Overview
+
+BERT モデルは日本語テキストでトレーニングされました。
+
+2 つの異なるトークン化方法を備えたモデルがあります。
+
+- MeCab と WordPiece を使用してトークン化します。これには、[MeCab](https://taku910.github.io/mecab/) のラッパーである [fugashi](https://github.com/polm/fugashi) という追加の依存関係が必要です。
+- 文字にトークン化します。
+
+*MecabTokenizer* を使用するには、`pip installTransformers["ja"]` (または、インストールする場合は `pip install -e .["ja"]`) する必要があります。
+ソースから）依存関係をインストールします。
+
+[cl-tohakuリポジトリの詳細](https://github.com/cl-tohaku/bert-japanese)を参照してください。
+
+MeCab および WordPiece トークン化でモデルを使用する例:
+
+
+```python
+>>> import torch
+>>> from transformers import AutoModel, AutoTokenizer
+
+>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
+>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
+
+>>> ## Input Japanese Text
+>>> line = "吾輩は猫である。"
+
+>>> inputs = tokenizer(line, return_tensors="pt")
+
+>>> print(tokenizer.decode(inputs["input_ids"][0]))
+[CLS] 吾輩 は 猫 で ある 。 [SEP]
+
+>>> outputs = bertjapanese(**inputs)
+```
+
+文字トークン化を使用したモデルの使用例:
+
+```python
+>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
+>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
+
+>>> ## Input Japanese Text
+>>> line = "吾輩は猫である。"
+
+>>> inputs = tokenizer(line, return_tensors="pt")
+
+>>> print(tokenizer.decode(inputs["input_ids"][0]))
+[CLS] 吾 輩 は 猫 で あ る 。 [SEP]
+
+>>> outputs = bertjapanese(**inputs)
+```
+
+<Tip>
+
+- この実装はトークン化方法を除いて BERT と同じです。その他の使用例については、[BERT のドキュメント](bert) を参照してください。
+
+</Tip>
+
+このモデルは[cl-tohaku](https://huggingface.co/cl-tohaku)から提供されました。
+
+## BertJapaneseTokenizer
+
+[[autodoc]] BertJapaneseTokenizer
diff --git a/docs/source/ja/model_doc/bert.md b/docs/source/ja/model_doc/bert.md
new file mode 100644
index 00000000000000..2f65ee49d3376c
--- /dev/null
+++ b/docs/source/ja/model_doc/bert.md
@@ -0,0 +1,312 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BERT
+
+<div class="flex flex-wrap space-x-1">
+<a href="https://huggingface.co/models?filter=bert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-bert-blueviolet">
+</a>
+<a href="https://huggingface.co/spaces/docs-demos/bert-base-uncased">
+<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
+</a>
+</div>
+
+## Overview
+
+BERT モデルは、Jacob Devlin、Ming-Wei Chang、Kenton Lee、Kristina Toutanova によって [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) で提案されました。それは
+マスクされた言語モデリング目標と次の文の組み合わせを使用して事前トレーニングされた双方向トランスフォーマー
+Toronto Book Corpus と Wikipedia からなる大規模なコーパスでの予測。
+
+論文の要約は次のとおりです。
+
+*BERT と呼ばれる新しい言語表現モデルを導入します。これは Bidirectional Encoder Representations の略です
+トランスフォーマーより。最近の言語表現モデルとは異なり、BERT は深い双方向性を事前にトレーニングするように設計されています。
+すべてのレイヤーの左と右の両方のコンテキストを共同で条件付けすることにより、ラベルのないテキストから表現します。結果として、
+事前トレーニングされた BERT モデルは、出力層を 1 つ追加するだけで微調整して、最先端のモデルを作成できます。
+実質的なタスク固有のものを必要とせず、質問応答や言語推論などの幅広いタスクに対応
+アーキテクチャの変更。*
+
+*BERT は概念的にはシンプルですが、経験的に強力です。 11 の自然な要素に関する新しい最先端の結果が得られます。
+言語処理タスク（GLUE スコアを 80.5% に押し上げる（7.7% ポイントの絶対改善）、MultiNLI を含む）
+精度は 86.7% (絶対値 4.6% 向上)、SQuAD v1.1 質問応答テスト F1 は 93.2 (絶対値 1.5 ポイント)
+改善) および SQuAD v2.0 テスト F1 から 83.1 (5.1 ポイントの絶対改善)。*
+
+## Usage tips
+
+- BERT は絶対位置埋め込みを備えたモデルであるため、通常は入力を右側にパディングすることをお勧めします。
+  左。
+- BERT は、マスク言語モデリング (MLM) および次の文予測 (NSP) の目標を使用してトレーニングされました。それは
+  マスクされたトークンの予測や NLU では一般に効率的ですが、テキスト生成には最適ではありません。
+- ランダム マスキングを使用して入力を破壊します。より正確には、事前トレーニング中に、トークンの指定された割合 (通常は 15%) が次によってマスクされます。
+
+    * 確率0.8の特別なマスクトークン
+    * 確率 0.1 でマスクされたトークンとは異なるランダムなトークン
+    * 確率 0.1 の同じトークン
+    
+- モデルは元の文を予測する必要がありますが、2 番目の目的があります。入力は 2 つの文 A と B (間に分離トークンあり) です。確率 50% では、文はコーパス内で連続していますが、残りの 50% では関連性がありません。モデルは、文が連続しているかどうかを予測する必要があります。
+
+
+
+このモデルは [thomwolf](https://huggingface.co/thomwolf) によって提供されました。元のコードは [こちら](https://github.com/google-research/bert) にあります。
+
+## Resources
+
+BERT を始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+<PipelineTag pipeline="text-classification"/>
+
+- に関するブログ投稿 [別の言語での BERT テキスト分類](https://www.philschmid.de/bert-text-classification-in-a-different-language)。
+- [マルチラベル テキスト分類のための BERT (およびその友人) の微調整](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/BERT/Fine_tuning_BERT_(and_friends)_for_multi_label_text_classification.ipynb) のノートブック.
+-  方法に関するノートブック [PyTorch を使用したマルチラベル分類のための BERT の微調整](https://colab.research.google.com/github/abhmishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)。 
+- 方法に関するノートブック [要約のために BERT を使用して EncoderDecoder モデルをウォームスタートする](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)。
+- [`BertForSequenceClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)。
+- [`TFBertForSequenceClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)。
+- [`FlaxBertForSequenceClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb)。
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+
+<PipelineTag pipeline="token-classification"/>
+
+- [Hugging Face Transformers with Keras: Fine-tune a non-English BERT for Named Entity Recognition](https://www.philschmid.de/huggingface-transformers-keras-tf) の使用方法に関するブログ投稿。
+- 各単語の最初の単語部分のみを使用した [固有表現認識のための BERT の微調整](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/Custom_Named_Entity_Recognition_with_BERT_only_first_wordpiece.ipynb) のノートブックトークン化中の単語ラベル内。単語のラベルをすべての単語部分に伝播するには、代わりにノートブックのこの [バージョン](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/BERT/Custom_Named_Entity_Recognition_with_BERT.ipynb) を参照してください。
+- [`BertForTokenClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)。
+- [`TFBertForTokenClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)。
+- [`FlaxBertForTokenClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification) によってサポートされています。
+- [トークン分類](https://huggingface.co/course/chapter7/2?fw=pt) 🤗 ハグフェイスコースの章。
+- [トークン分類タスクガイド](../tasks/token_classification)
+
+<PipelineTag pipeline="fill-mask"/>
+
+- [`BertForMaskedLM`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling) でサポートされており、 [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)。
+- [`TFBertForMaskedLM`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/lang-modeling#run_mlmpy) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)。
+- [`FlaxBertForMaskedLM`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling) および [ノートブック]( https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb)。
+- [マスクされた言語モデリング](https://huggingface.co/course/chapter7/3?fw=pt) 🤗 顔ハグ コースの章。
+- [マスクされた言語モデリング タスク ガイド](../tasks/masked_lang_modeling)
+
+
+<PipelineTag pipeline="question-answering"/>
+
+- [`BertForQuestionAnswering`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)。
+- [`TFBertForQuestionAnswering`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)。
+- [`FlaxBertForQuestionAnswering`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering) でサポートされています。
+- [質問回答](https://huggingface.co/course/chapter7/7?fw=pt) 🤗 ハグフェイスコースの章。
+- [質問回答タスク ガイド](../tasks/question_answering)
+
+**複数の選択肢**
+- [`BertForMultipleChoice`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)。
+- [`TFBertForMultipleChoice`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)。
+- [多肢選択タスク ガイド](../tasks/multiple_choice)
+
+⚡️ **推論**
+- 方法に関するブログ投稿  [Hugging Face Transformers と AWS Inferentia を使用して BERT 推論を高速化する](https://huggingface.co/blog/bert-inferentia-sagemaker)。
+- 方法に関するブログ投稿 [GPU 上の DeepSpeed-Inference を使用して BERT 推論を高速化する](https://www.philschmid.de/bert-deepspeed-inference)。
+
+⚙️ **事前トレーニング**
+- [Hugging Face Transformers と Habana Gaudi を使用した BERT の事前トレーニング] に関するブログ投稿 (https://www.philschmid.de/pre-training-bert-habana)。
+
+🚀 **デプロイ**
+- 方法に関するブログ投稿  [ハグフェイス最適化でトランスフォーマーを ONNX に変換する](https://www.philschmid.de/convert-transformers-to-onnx)。
+- 方法に関するブログ投稿 [AWS 上の Habana Gaudi を使用したハグ顔トランスフォーマーのための深層学習環境のセットアップ](https://www.philschmid.de/getting-started-habana-gaudi#conclusion)。
+- に関するブログ投稿  [Hugging Face Transformers、Amazon SageMaker、および Terraform モジュールを使用した自動スケーリング BERT](https://www.philschmid.de/terraform-huggingface-amazon-sagemaker-advanced)。
+- に関するブログ投稿  [HuggingFace、AWS Lambda、Docker を使用したサーバーレス BERT](https://www.philschmid.de/serverless-bert-with-huggingface-aws-lambda-docker)。
+- に関するブログ投稿 [Amazon SageMaker と Training Compiler を使用した Hugging Face Transformers BERT 微調整](https://www.philschmid.de/huggingface-amazon-sagemaker-training-compiler)。
+- に関するブログ投稿  [Transformers と Amazon SageMaker を使用した BERT のタスク固有の知識の蒸留](https://www.philschmid.de/knowledge-distillation-bert-transformers)
+
+## BertConfig
+
+[[autodoc]] BertConfig
+    - all
+
+## BertTokenizer
+
+[[autodoc]] BertTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+<frameworkcontent>
+<pt>
+
+## BertTokenizerFast
+
+[[autodoc]] BertTokenizerFast
+
+</pt>
+<tf>
+
+## TFBertTokenizer
+
+[[autodoc]] TFBertTokenizer
+
+</tf>
+</frameworkcontent>
+
+## Bert specific outputs
+
+[[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput
+
+[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
+
+[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+
+<frameworkcontent>
+<pt>
+
+## BertModel
+
+[[autodoc]] BertModel
+    - forward
+
+## BertForPreTraining
+
+[[autodoc]] BertForPreTraining
+    - forward
+
+## BertLMHeadModel
+
+[[autodoc]] BertLMHeadModel
+    - forward
+
+## BertForMaskedLM
+
+[[autodoc]] BertForMaskedLM
+    - forward
+
+## BertForNextSentencePrediction
+
+[[autodoc]] BertForNextSentencePrediction
+    - forward
+
+## BertForSequenceClassification
+
+[[autodoc]] BertForSequenceClassification
+    - forward
+
+## BertForMultipleChoice
+
+[[autodoc]] BertForMultipleChoice
+    - forward
+
+## BertForTokenClassification
+
+[[autodoc]] BertForTokenClassification
+    - forward
+
+## BertForQuestionAnswering
+
+[[autodoc]] BertForQuestionAnswering
+    - forward
+
+</pt>
+<tf>
+
+## TFBertModel
+
+[[autodoc]] TFBertModel
+    - call
+
+## TFBertForPreTraining
+
+[[autodoc]] TFBertForPreTraining
+    - call
+
+## TFBertModelLMHeadModel
+
+[[autodoc]] TFBertLMHeadModel
+    - call
+
+## TFBertForMaskedLM
+
+[[autodoc]] TFBertForMaskedLM
+    - call
+
+## TFBertForNextSentencePrediction
+
+[[autodoc]] TFBertForNextSentencePrediction
+    - call
+
+## TFBertForSequenceClassification
+
+[[autodoc]] TFBertForSequenceClassification
+    - call
+
+## TFBertForMultipleChoice
+
+[[autodoc]] TFBertForMultipleChoice
+    - call
+
+## TFBertForTokenClassification
+
+[[autodoc]] TFBertForTokenClassification
+    - call
+
+## TFBertForQuestionAnswering
+
+[[autodoc]] TFBertForQuestionAnswering
+    - call
+
+</tf>
+<jax>
+
+
+## FlaxBertModel
+
+[[autodoc]] FlaxBertModel
+    - __call__
+
+## FlaxBertForPreTraining
+
+[[autodoc]] FlaxBertForPreTraining
+    - __call__
+
+## FlaxBertForCausalLM
+
+[[autodoc]] FlaxBertForCausalLM
+    - __call__
+
+## FlaxBertForMaskedLM
+
+[[autodoc]] FlaxBertForMaskedLM
+    - __call__
+
+## FlaxBertForNextSentencePrediction
+
+[[autodoc]] FlaxBertForNextSentencePrediction
+    - __call__
+
+## FlaxBertForSequenceClassification
+
+[[autodoc]] FlaxBertForSequenceClassification
+    - __call__
+
+## FlaxBertForMultipleChoice
+
+[[autodoc]] FlaxBertForMultipleChoice
+    - __call__
+
+## FlaxBertForTokenClassification
+
+[[autodoc]] FlaxBertForTokenClassification
+    - __call__
+
+## FlaxBertForQuestionAnswering
+
+[[autodoc]] FlaxBertForQuestionAnswering
+    - __call__
+
+</jax>
+</frameworkcontent>
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/bertweet.md b/docs/source/ja/model_doc/bertweet.md
new file mode 100644
index 00000000000000..3a5dddbf04cc58
--- /dev/null
+++ b/docs/source/ja/model_doc/bertweet.md
@@ -0,0 +1,68 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BERTweet
+
+## Overview
+
+BERTweet モデルは、Dat Quoc Nguyen、Thanh Vu によって [BERTweet: A pre-trained language model for English Tweets](https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf) で提案されました。アン・トゥアン・グエンさん。
+
+論文の要約は次のとおりです。
+
+*私たちは、英語ツイート用に初めて公開された大規模な事前トレーニング済み言語モデルである BERTweet を紹介します。私たちのBERTweetは、
+BERT ベースと同じアーキテクチャ (Devlin et al., 2019) は、RoBERTa 事前トレーニング手順 (Liu et al.) を使用してトレーニングされます。
+al.、2019）。実験では、BERTweet が強力なベースラインである RoBERTa ベースおよび XLM-R ベースを上回るパフォーマンスを示すことが示されています (Conneau et al.,
+2020)、3 つのツイート NLP タスクにおいて、以前の最先端モデルよりも優れたパフォーマンス結果が得られました。
+品詞タグ付け、固有表現認識およびテキスト分類。*
+
+## Usage example
+
+```python
+>>> import torch
+>>> from transformers import AutoModel, AutoTokenizer
+
+>>> bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+
+>>> # For transformers v4.x+:
+>>> tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+
+>>> # For transformers v3.x:
+>>> # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+
+>>> # INPUT TWEET IS ALREADY NORMALIZED!
+>>> line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+
+>>> input_ids = torch.tensor([tokenizer.encode(line)])
+
+>>> with torch.no_grad():
+...     features = bertweet(input_ids)  # Models outputs are now tuples
+
+>>> # With TensorFlow 2.0+:
+>>> # from transformers import TFAutoModel
+>>> # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+```
+<Tip>
+
+この実装は、トークン化方法を除いて BERT と同じです。詳細については、[BERT ドキュメント](bert) を参照してください。
+API リファレンス情報。
+
+</Tip>
+
+このモデルは [dqnguyen](https://huggingface.co/dqnguyen) によって提供されました。元のコードは [ここ](https://github.com/VinAIResearch/BERTweet) にあります。
+
+## BertweetTokenizer
+
+[[autodoc]] BertweetTokenizer
diff --git a/docs/source/ja/model_doc/big_bird.md b/docs/source/ja/model_doc/big_bird.md
new file mode 100644
index 00000000000000..4c0dabbebb46d4
--- /dev/null
+++ b/docs/source/ja/model_doc/big_bird.md
@@ -0,0 +1,176 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BigBird
+
+## Overview
+
+BigBird モデルは、[Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) で提案されました。
+ザヒール、マンジルとグルガネシュ、グルとダベイ、クマール・アヴィナヴァとエインズリー、ジョシュアとアルベルティ、クリスとオンタノン、
+サンティアゴとファム、フィリップとラブラ、アニルードとワン、キーファンとヤン、リーなど。 BigBird は注目度が低い
+BERT などの Transformer ベースのモデルをさらに長いシーケンスに拡張する、Transformer ベースのモデル。まばらに加えて
+アテンションと同様に、BigBird は入力シーケンスにランダム アテンションだけでなくグローバル アテンションも適用します。理論的には、
+まばらで全体的でランダムな注意を適用すると、完全な注意に近づくことが示されていますが、
+長いシーケンスでは計算効率が大幅に向上します。より長いコンテキストを処理できる機能の結果として、
+BigBird は、質問応答や
+BERT または RoBERTa と比較した要約。
+
+論文の要約は次のとおりです。
+
+*BERT などのトランスフォーマーベースのモデルは、NLP で最も成功した深層学習モデルの 1 つです。
+残念ながら、それらの中核的な制限の 1 つは、シーケンスに対する二次依存性 (主にメモリに関する) です。
+完全な注意メカニズムによる長さです。これを解決するために、BigBird は、まばらな注意メカニズムを提案します。
+この二次依存関係を線形に削減します。 BigBird がシーケンス関数の汎用近似器であることを示します。
+チューリングは完全であるため、二次完全注意モデルのこれらの特性が保存されます。途中、私たちの
+理論分析により、O(1) 個のグローバル トークン (CLS など) を持つ利点の一部が明らかになり、
+スパース注意メカニズムの一部としてのシーケンス。提案されたスパース アテンションは、次の長さのシーケンスを処理できます。
+同様のハードウェアを使用して以前に可能であったものの 8 倍。より長いコンテキストを処理できる機能の結果として、
+BigBird は、質問応答や要約などのさまざまな NLP タスクのパフォーマンスを大幅に向上させます。私達も
+ゲノミクスデータへの新しいアプリケーションを提案します。*
+
+チップ：
+
+- BigBird の注意がどのように機能するかについての詳細な説明については、[このブログ投稿](https://huggingface.co/blog/big-bird) を参照してください。
+- BigBird には、**original_full** と **block_sparse** の 2 つの実装が付属しています。シーケンス長が 1024 未満の場合、次を使用します。
+  **block_sparse** を使用してもメリットがないため、**original_full** を使用することをお勧めします。
+- コードは現在、3 ブロックと 2 グローバル ブロックのウィンドウ サイズを使用しています。
+- シーケンスの長さはブロック サイズで割り切れる必要があります。
+- 現在の実装では **ITC** のみがサポートされています。
+- 現在の実装では **num_random_blocks = 0** はサポートされていません
+- BigBird は絶対位置埋め込みを備えたモデルであるため、通常は入力を右側にパディングすることをお勧めします。
+  左。
+
+  このモデルは、[vasudevgupta](https://huggingface.co/vasudevgupta) によって提供されました。元のコードが見つかる
+[こちら](https://github.com/google-research/bigbird)。
+
+## ドキュメント リソース
+
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [トークン分類タスクガイド](../tasks/token_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [マスクされた言語モデリング タスク ガイド](../tasks/masked_lang_modeling)
+- [多肢選択タスク ガイド](../tasks/multiple_choice)
+
+## BigBirdConfig
+
+[[autodoc]] BigBirdConfig
+
+## BigBirdTokenizer
+
+[[autodoc]] BigBirdTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## BigBirdTokenizerFast
+
+[[autodoc]] BigBirdTokenizerFast
+
+## BigBird specific outputs
+
+[[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput
+
+<frameworkcontent>
+<pt>
+
+## BigBirdModel
+
+[[autodoc]] BigBirdModel
+    - forward
+
+## BigBirdForPreTraining
+
+[[autodoc]] BigBirdForPreTraining
+    - forward
+
+## BigBirdForCausalLM
+
+[[autodoc]] BigBirdForCausalLM
+    - forward
+
+## BigBirdForMaskedLM
+
+[[autodoc]] BigBirdForMaskedLM
+    - forward
+
+## BigBirdForSequenceClassification
+
+[[autodoc]] BigBirdForSequenceClassification
+    - forward
+
+## BigBirdForMultipleChoice
+
+[[autodoc]] BigBirdForMultipleChoice
+    - forward
+
+## BigBirdForTokenClassification
+
+[[autodoc]] BigBirdForTokenClassification
+    - forward
+
+## BigBirdForQuestionAnswering
+
+[[autodoc]] BigBirdForQuestionAnswering
+    - forward
+
+</pt>
+<jax>
+
+## FlaxBigBirdModel
+
+[[autodoc]] FlaxBigBirdModel
+    - __call__
+
+## FlaxBigBirdForPreTraining
+
+[[autodoc]] FlaxBigBirdForPreTraining
+    - __call__
+
+## FlaxBigBirdForCausalLM
+
+[[autodoc]] FlaxBigBirdForCausalLM
+    - __call__
+
+## FlaxBigBirdForMaskedLM
+
+[[autodoc]] FlaxBigBirdForMaskedLM
+    - __call__
+
+## FlaxBigBirdForSequenceClassification
+
+[[autodoc]] FlaxBigBirdForSequenceClassification
+    - __call__
+
+## FlaxBigBirdForMultipleChoice
+
+[[autodoc]] FlaxBigBirdForMultipleChoice
+    - __call__
+
+## FlaxBigBirdForTokenClassification
+
+[[autodoc]] FlaxBigBirdForTokenClassification
+    - __call__
+
+## FlaxBigBirdForQuestionAnswering
+
+[[autodoc]] FlaxBigBirdForQuestionAnswering
+    - __call__
+
+</jax>
+</frameworkcontent>
+
diff --git a/docs/source/ja/model_doc/bigbird_pegasus.md b/docs/source/ja/model_doc/bigbird_pegasus.md
new file mode 100644
index 00000000000000..e0132b4b5f86b5
--- /dev/null
+++ b/docs/source/ja/model_doc/bigbird_pegasus.md
@@ -0,0 +1,95 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BigBirdPegasus
+
+## Overview
+
+BigBird モデルは、[Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) で提案されました。
+ザヒール、マンジルとグルガネシュ、グルとダベイ、クマール・アヴィナヴァとエインズリー、ジョシュアとアルベルティ、クリスとオンタノン、
+サンティアゴとファム、フィリップとラブラ、アニルードとワン、キーファンとヤン、リーなど。 BigBird は注目度が低い
+BERT などの Transformer ベースのモデルをさらに長いシーケンスに拡張する、Transformer ベースのモデル。まばらに加えて
+アテンションと同様に、BigBird は入力シーケンスにランダム アテンションだけでなくグローバル アテンションも適用します。理論的には、
+まばらで全体的でランダムな注意を適用すると、完全な注意に近づくことが示されていますが、
+長いシーケンスでは計算効率が大幅に向上します。より長いコンテキストを処理できる機能の結果として、
+BigBird は、質問応答や
+BERT または RoBERTa と比較した要約。
+
+論文の要約は次のとおりです。
+
+*BERT などのトランスフォーマーベースのモデルは、NLP で最も成功した深層学習モデルの 1 つです。
+残念ながら、それらの中核的な制限の 1 つは、シーケンスに対する二次依存性 (主にメモリに関する) です。
+完全な注意メカニズムによる長さです。これを解決するために、BigBird は、まばらな注意メカニズムを提案します。
+この二次依存関係を線形に削減します。 BigBird がシーケンス関数の汎用近似器であることを示します。
+チューリングは完全であるため、二次完全注意モデルのこれらの特性が保存されます。途中、私たちの
+理論分析により、O(1) 個のグローバル トークン (CLS など) を持つ利点の一部が明らかになり、
+スパース注意メカニズムの一部としてのシーケンス。提案されたスパース アテンションは、次の長さのシーケンスを処理できます。
+同様のハードウェアを使用して以前に可能であったものの 8 倍。より長いコンテキストを処理できる機能の結果として、
+BigBird は、質問応答や要約などのさまざまな NLP タスクのパフォーマンスを大幅に向上させます。私達も
+ゲノミクスデータへの新しいアプリケーションを提案します。*
+
+## Usage tips
+
+- BigBird の注意がどのように機能するかについての詳細な説明については、[このブログ投稿](https://huggingface.co/blog/big-bird) を参照してください。
+- BigBird には、**original_full** と **block_sparse** の 2 つの実装が付属しています。シーケンス長が 1024 未満の場合、次を使用します。
+  **block_sparse** を使用してもメリットがないため、**original_full** を使用することをお勧めします。
+- コードは現在、3 ブロックと 2 グローバル ブロックのウィンドウ サイズを使用しています。
+- シーケンスの長さはブロック サイズで割り切れる必要があります。
+- 現在の実装では **ITC** のみがサポートされています。
+- 現在の実装では **num_random_blocks = 0** はサポートされていません。
+- BigBirdPegasus は [PegasusTokenizer](https://github.com/huggingface/transformers/blob/main/src/transformers/models/pegasus/tokenization_pegasus.py) を使用します。
+- BigBird は絶対位置埋め込みを備えたモデルであるため、通常は入力を右側にパディングすることをお勧めします。
+  左。
+
+元のコードは [こちら](https://github.com/google-research/bigbird) にあります。
+
+## ドキュメント リソース
+
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [翻訳タスクガイド](../tasks/translation)
+- [要約タスクガイド](../tasks/summarization)
+
+## BigBirdPegasusConfig
+
+[[autodoc]] BigBirdPegasusConfig
+    - all
+
+## BigBirdPegasusModel
+
+[[autodoc]] BigBirdPegasusModel
+    - forward
+
+## BigBirdPegasusForConditionalGeneration
+
+[[autodoc]] BigBirdPegasusForConditionalGeneration
+    - forward
+
+## BigBirdPegasusForSequenceClassification
+
+[[autodoc]] BigBirdPegasusForSequenceClassification
+    - forward
+
+## BigBirdPegasusForQuestionAnswering
+
+[[autodoc]] BigBirdPegasusForQuestionAnswering
+    - forward
+
+## BigBirdPegasusForCausalLM
+
+[[autodoc]] BigBirdPegasusForCausalLM
+    - forward
diff --git a/docs/source/ja/model_doc/biogpt.md b/docs/source/ja/model_doc/biogpt.md
new file mode 100644
index 00000000000000..0634062a6b7214
--- /dev/null
+++ b/docs/source/ja/model_doc/biogpt.md
@@ -0,0 +1,73 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BioGPT
+
+## Overview
+
+BioGPT モデルは、[BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo、Liai Sun、Yingce Xia、 Tao Qin、Sheng Zhang、Hoifung Poon、Tie-Yan Liu。 BioGPT は、生物医学テキストの生成とマイニングのための、ドメイン固有の生成事前トレーニング済み Transformer 言語モデルです。 BioGPT は、Transformer 言語モデルのバックボーンに従い、1,500 万の PubMed 抄録で最初から事前トレーニングされています。
+
+論文の要約は次のとおりです。
+
+*事前トレーニング済み言語モデルは、一般的な自然言語領域での大きな成功に触発されて、生物医学領域でますます注目を集めています。一般言語ドメインの事前トレーニング済み言語モデルの 2 つの主なブランチ、つまり BERT (およびそのバリアント) と GPT (およびそのバリアント) のうち、1 つ目は BioBERT や PubMedBERT などの生物医学ドメインで広く研究されています。これらはさまざまな下流の生物医学的タスクで大きな成功を収めていますが、生成能力の欠如により応用範囲が制限されています。この論文では、大規模な生物医学文献で事前トレーニングされたドメイン固有の生成 Transformer 言語モデルである BioGPT を提案します。私たちは 6 つの生物医学的自然言語処理タスクで BioGPT を評価し、ほとんどのタスクで私たちのモデルが以前のモデルよりも優れていることを実証しました。特に、BC5CDR、KD-DTI、DDI のエンドツーエンド関係抽出タスクではそれぞれ 44.98%、38.42%、40.76% の F1 スコアを獲得し、PubMedQA では 78.2% の精度を獲得し、新記録を樹立しました。テキスト生成に関する私たちのケーススタディは、生物医学文献における BioGPT の利点をさらに実証し、生物医学用語の流暢な説明を生成します。*
+
+## Usage tips
+
+- BioGPT は絶対位置埋め込みを備えたモデルであるため、通常は入力を左側ではなく右側にパディングすることをお勧めします。
+- BioGPT は因果言語モデリング (CLM) 目的でトレーニングされているため、シーケンス内の次のトークンを予測するのに強力です。 run_generation.py サンプル スクリプトで確認できるように、この機能を利用すると、BioGPT は構文的に一貫したテキストを生成できます。
+- モデルは、以前に計算されたキーと値のアテンション ペアである`past_key_values`(PyTorch の場合) を入力として受け取ることができます。この (past_key_values または past) 値を使用すると、モデルがテキスト生成のコンテキストで事前に計算された値を再計算できなくなります。 PyTorch の使用法の詳細については、BioGptForCausalLM.forward() メソッドの past_key_values 引数を参照してください。
+
+このモデルは、[kamalkraj](https://huggingface.co/kamalkraj) によって提供されました。元のコードは [ここ](https://github.com/microsoft/BioGPT) にあります。
+
+## Documentation resources
+
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+
+## BioGptConfig
+
+[[autodoc]] BioGptConfig
+
+
+## BioGptTokenizer
+
+[[autodoc]] BioGptTokenizer
+    - save_vocabulary
+
+
+## BioGptModel
+
+[[autodoc]] BioGptModel
+    - forward
+
+
+## BioGptForCausalLM
+
+[[autodoc]] BioGptForCausalLM
+    - forward
+
+    
+## BioGptForTokenClassification
+
+[[autodoc]] BioGptForTokenClassification
+    - forward
+
+
+## BioGptForSequenceClassification
+
+[[autodoc]] BioGptForSequenceClassification
+    - forward
+
+    
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/bit.md b/docs/source/ja/model_doc/bit.md
new file mode 100644
index 00000000000000..76b24fa64470a2
--- /dev/null
+++ b/docs/source/ja/model_doc/bit.md
@@ -0,0 +1,65 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Big Transfer (BiT)
+
+## Overview
+
+BiT モデルは、Alexander Kolesnikov、Lucas Beyer、Xiaohua Zhai、Joan Puigcerver、Jessica Yung、Sylvain Gelly によって [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) で提案されました。ニール・ホールズビー。
+BiT は、[ResNet](resnet) のようなアーキテクチャ (具体的には ResNetv2) の事前トレーニングをスケールアップするための簡単なレシピです。この方法により、転移学習が大幅に改善されます。
+
+論文の要約は次のとおりです。
+
+*事前トレーニングされた表現の転送により、サンプル効率が向上し、視覚用のディープ ニューラル ネットワークをトレーニングする際のハイパーパラメーター調整が簡素化されます。大規模な教師ありデータセットでの事前トレーニングと、ターゲット タスクでのモデルの微調整のパラダイムを再検討します。私たちは事前トレーニングをスケールアップし、Big Transfer (BiT) と呼ぶシンプルなレシピを提案します。いくつかの慎重に選択されたコンポーネントを組み合わせ、シンプルなヒューリスティックを使用して転送することにより、20 を超えるデータセットで優れたパフォーマンスを実現します。 BiT は、クラスごとに 1 つのサンプルから合計 100 万のサンプルまで、驚くほど広範囲のデータ領域にわたって良好にパフォーマンスを発揮します。 BiT は、ILSVRC-2012 で 87.5%、CIFAR-10 で 99.4%、19 タスクの Visual Task Adaptation Benchmark (VTAB) で 76.3% のトップ 1 精度を達成しました。小規模なデータセットでは、BiT は ILSVRC-2012 (クラスあたり 10 例) で 76.8%、CIFAR-10 (クラスあたり 10 例) で 97.0% を達成しました。高い転写性能を実現する主要成分を詳細に分析※。
+
+## Usage tips
+
+- BiT モデルは、アーキテクチャの点で ResNetv2 と同等ですが、次の点が異なります: 1) すべてのバッチ正規化層が [グループ正規化](https://arxiv.org/abs/1803.08494) に置き換えられます。
+2) [重みの標準化](https://arxiv.org/abs/1903.10520) は畳み込み層に使用されます。著者らは、両方の組み合わせが大きなバッチサイズでのトレーニングに役立ち、重要な効果があることを示しています。
+転移学習への影響。
+
+このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。
+元のコードは [こちら](https://github.com/google-research/big_transfer) にあります。
+
+## Resources
+
+BiT を始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。
+
+<PipelineTag pipeline="image-classification"/>
+
+- [`BitForImageClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)。
+- 参照: [画像分類タスク ガイド](../tasks/image_classification)
+
+ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+## BitConfig
+
+[[autodoc]] BitConfig
+
+## BitImageProcessor
+
+[[autodoc]] BitImageProcessor
+    - preprocess
+
+## BitModel
+
+[[autodoc]] BitModel
+    - forward
+
+## BitForImageClassification
+
+[[autodoc]] BitForImageClassification
+    - forward
diff --git a/docs/source/ja/model_doc/blenderbot-small.md b/docs/source/ja/model_doc/blenderbot-small.md
new file mode 100644
index 00000000000000..ecb9c1174b285b
--- /dev/null
+++ b/docs/source/ja/model_doc/blenderbot-small.md
@@ -0,0 +1,110 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Blenderbot Small
+
+[`BlenderbotSmallModel`] と
+[`BlenderbotSmallForConditionalGeneration`] はチェックポイントと組み合わせてのみ使用されます
+[facebook/blenderbot-90M](https://huggingface.co/facebook/blenderbot-90M)。より大規模な Blenderbot チェックポイントは、
+代わりに [`BlenderbotModel`] とともに使用してください。
+[`BlenderbotForConditionalGeneration`]
+
+## Overview
+
+Blender チャットボット モデルは、[Recipes for building an open-domain chatbot](https://arxiv.org/pdf/2004.13637.pdf) Stephen Roller、Emily Dinan、Naman Goyal、Da Ju、Mary Williamson、yinghan Liu、で提案されました。
+ジン・シュー、マイル・オット、カート・シャスター、エリック・M・スミス、Y-ラン・ブーロー、ジェイソン・ウェストン、2020年4月30日。
+
+論文の要旨は次のとおりです。
+
+*オープンドメインのチャットボットの構築は、機械学習研究にとって難しい分野です。これまでの研究では次のことが示されていますが、
+ニューラル モデルをパラメーターの数とトレーニング対象のデータのサイズでスケーリングすると、結果が向上します。
+高性能のチャットボットには他の要素も重要であることを示します。良い会話には多くのことが必要です
+会話の専門家がシームレスに融合するスキル: 魅力的な話のポイントを提供し、話を聞く
+一貫した態度を維持しながら、知識、共感、個性を適切に表現する
+ペルソナ。適切なトレーニング データと選択が与えられた場合、大規模モデルがこれらのスキルを学習できることを示します。
+世代戦略。 90M、2.7B、9.4B パラメーター モデルを使用してこれらのレシピのバリアントを構築し、モデルを作成します。
+コードは公開されています。人間による評価では、当社の最良のモデルが既存のアプローチよりも優れていることがマルチターンで示されています
+魅力と人間性の測定という観点からの対話。次に、分析によってこの作業の限界について説明します。
+弊社機種の故障事例*
+
+チップ：
+
+- Blenderbot Small は絶対位置埋め込みを備えたモデルなので、通常は入力を右側にパディングすることをお勧めします。
+  左。
+
+このモデルは、[patrickvonplaten](https://huggingface.co/patrickvonplaten) によって提供されました。著者のコードは次のとおりです
+[ここ](https://github.com/facebookresearch/ParlAI) をご覧ください。
+
+## Documentation resources
+
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [翻訳タスクガイド](../tasks/translation)
+- [要約タスクガイド](../tasks/summarization)
+
+## BlenderbotSmallConfig
+
+[[autodoc]] BlenderbotSmallConfig
+
+## BlenderbotSmallTokenizer
+
+[[autodoc]] BlenderbotSmallTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## BlenderbotSmallTokenizerFast
+
+[[autodoc]] BlenderbotSmallTokenizerFast
+
+## BlenderbotSmallModel
+
+[[autodoc]] BlenderbotSmallModel
+    - forward
+
+## BlenderbotSmallForConditionalGeneration
+
+[[autodoc]] BlenderbotSmallForConditionalGeneration
+    - forward
+
+## BlenderbotSmallForCausalLM
+
+[[autodoc]] BlenderbotSmallForCausalLM
+    - forward
+
+## TFBlenderbotSmallModel
+
+[[autodoc]] TFBlenderbotSmallModel
+    - call
+
+## TFBlenderbotSmallForConditionalGeneration
+
+[[autodoc]] TFBlenderbotSmallForConditionalGeneration
+    - call
+
+## FlaxBlenderbotSmallModel
+
+[[autodoc]] FlaxBlenderbotSmallModel
+    - __call__
+    - encode
+    - decode
+
+## FlaxBlenderbotForConditionalGeneration
+
+[[autodoc]] FlaxBlenderbotSmallForConditionalGeneration
+    - __call__
+    - encode
+    - decode
diff --git a/docs/source/ja/model_doc/blenderbot.md b/docs/source/ja/model_doc/blenderbot.md
new file mode 100644
index 00000000000000..f7ee23e7557d7b
--- /dev/null
+++ b/docs/source/ja/model_doc/blenderbot.md
@@ -0,0 +1,132 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Blenderbot
+
+**免責事項:** 何か奇妙なものを見つけた場合は、 [Github Issue](https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title) を報告してください。
+
+## Overview
+
+Blender チャットボット モデルは、[Recipes for building an open-domain chatbot](https://arxiv.org/pdf/2004.13637.pdf) Stephen Roller、Emily Dinan、Naman Goyal、Da Ju、Mary Williamson、yinghan Liu、で提案されました。
+ジン・シュー、マイル・オット、カート・シャスター、エリック・M・スミス、Y-ラン・ブーロー、ジェイソン・ウェストン、2020年4月30日。
+
+論文の要旨は次のとおりです。
+
+*オープンドメインのチャットボットの構築は、機械学習研究にとって難しい分野です。これまでの研究では次のことが示されていますが、
+ニューラル モデルをパラメーターの数とトレーニング対象のデータのサイズでスケーリングすると、結果が向上します。
+高性能のチャットボットには他の要素も重要であることを示します。良い会話には多くのことが必要です
+会話の専門家がシームレスに融合するスキル: 魅力的な話のポイントを提供し、話を聞く
+一貫した態度を維持しながら、知識、共感、個性を適切に表現する
+ペルソナ。適切なトレーニング データと選択が与えられた場合、大規模モデルがこれらのスキルを学習できることを示します。
+世代戦略。 90M、2.7B、9.4B パラメーター モデルを使用してこれらのレシピのバリアントを構築し、モデルを作成します。
+コードは公開されています。人間による評価では、当社の最良のモデルが既存のアプローチよりも優れていることがマルチターンで示されています
+魅力と人間性の測定という観点からの対話。次に、分析によってこの作業の限界について説明します。
+弊社機種の故障事例*
+
+チップ：
+
+- Blenderbot は絶対位置埋め込みを備えたモデルであるため、通常は入力を右側にパディングすることをお勧めします。
+  左。
+
+このモデルは [sshleifer](https://huggingface.co/sshleifer) によって提供されました。著者のコードは [ここ](https://github.com/facebookresearch/ParlAI) にあります。
+
+## Implementation Notes
+
+- Blenderbot は、標準の [seq2seq モデル トランスフォーマー](https://arxiv.org/pdf/1706.03762.pdf) ベースのアーキテクチャを使用します。
+- 利用可能なチェックポイントは、[モデル ハブ](https://huggingface.co/models?search=blenderbot) で見つけることができます。
+- これは *デフォルト* Blenderbot モデル クラスです。ただし、次のような小さなチェックポイントもいくつかあります。
+  `facebook/blenderbot_small_90M` はアーキテクチャが異なるため、一緒に使用する必要があります。
+  [BlenderbotSmall](ブレンダーボット小)。
+
+## Usage
+
+モデルの使用例を次に示します。
+
+```python
+>>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
+
+>>> mname = "facebook/blenderbot-400M-distill"
+>>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
+>>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
+>>> UTTERANCE = "My friends are cool but they eat too many carbs."
+>>> inputs = tokenizer([UTTERANCE], return_tensors="pt")
+>>> reply_ids = model.generate(**inputs)
+>>> print(tokenizer.batch_decode(reply_ids))
+["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
+```
+
+## Documentation resources
+
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [翻訳タスクガイド](../tasks/translation)
+- [要約タスクガイド](../tasks/summarization)
+
+## BlenderbotConfig
+
+[[autodoc]] BlenderbotConfig
+
+## BlenderbotTokenizer
+
+[[autodoc]] BlenderbotTokenizer
+    - build_inputs_with_special_tokens
+
+## BlenderbotTokenizerFast
+
+[[autodoc]] BlenderbotTokenizerFast
+    - build_inputs_with_special_tokens
+
+## BlenderbotModel
+
+*forward* および *generate* の引数については、`transformers.BartModel`を参照してください。
+
+[[autodoc]] BlenderbotModel
+    - forward
+
+## BlenderbotForConditionalGeneration
+
+*forward* と *generate* の引数については、[`~transformers.BartForConditionalGeneration`] を参照してください。
+
+[[autodoc]] BlenderbotForConditionalGeneration
+    - forward
+
+## BlenderbotForCausalLM
+
+[[autodoc]] BlenderbotForCausalLM
+    - forward
+
+## TFBlenderbotModel
+
+[[autodoc]] TFBlenderbotModel
+    - call
+
+## TFBlenderbotForConditionalGeneration
+
+[[autodoc]] TFBlenderbotForConditionalGeneration
+    - call
+
+## FlaxBlenderbotModel
+
+[[autodoc]] FlaxBlenderbotModel
+    - __call__
+    - encode
+    - decode
+
+## FlaxBlenderbotForConditionalGeneration
+
+[[autodoc]] FlaxBlenderbotForConditionalGeneration
+    - __call__
+    - encode
+    - decode
diff --git a/docs/source/ja/model_doc/blip-2.md b/docs/source/ja/model_doc/blip-2.md
new file mode 100644
index 00000000000000..bd110522e27d60
--- /dev/null
+++ b/docs/source/ja/model_doc/blip-2.md
@@ -0,0 +1,90 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BLIP-2
+
+## Overview
+
+BLIP-2 モデルは、[BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) で提案されました。
+Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.・サバレーゼ、スティーブン・ホイ。 BLIP-2 は、軽量の 12 層 Transformer をトレーニングすることで、フリーズされた事前トレーニング済み画像エンコーダーと大規模言語モデル (LLM) を活用します。
+それらの間にエンコーダーを配置し、さまざまな視覚言語タスクで最先端のパフォーマンスを実現します。最も注目すべき点は、BLIP-2 が 800 億パラメータ モデルである [Flamingo](https://arxiv.org/abs/2204.14198) を 8.7% 改善していることです。
+ゼロショット VQAv2 ではトレーニング可能なパラメーターが 54 分の 1 に減少します。
+
+論文の要約は次のとおりです。
+
+*大規模モデルのエンドツーエンドのトレーニングにより、視覚と言語の事前トレーニングのコストはますます法外なものになってきています。この論文では、市販の凍結済み事前トレーニング画像エンコーダと凍結された大規模言語モデルから視覚言語の事前トレーニングをブートストラップする、汎用的で効率的な事前トレーニング戦略である BLIP-2 を提案します。 BLIP-2 は、2 段階で事前トレーニングされた軽量の Querying Transformer でモダリティのギャップを橋渡しします。最初のステージでは、フリーズされた画像エンコーダーから学習する視覚言語表現をブートストラップします。第 2 段階では、凍結された言語モデルから視覚から言語への生成学習をブートストラップします。 BLIP-2 は、既存の方法よりもトレーニング可能なパラメーターが大幅に少ないにもかかわらず、さまざまな視覚言語タスクで最先端のパフォーマンスを実現します。たとえば、私たちのモデルは、トレーニング可能なパラメーターが 54 分の 1 少ないゼロショット VQAv2 で、Flamingo80B を 8.7% 上回っています。また、自然言語の命令に従うことができる、ゼロショット画像からテキストへの生成というモデルの新しい機能も実証します*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/blip2_architecture.jpg"
+alt="drawing" width="600"/> 
+
+<small> BLIP-2 アーキテクチャ。 <a href="https://arxiv.org/abs/2301.12597">元の論文から抜粋。</a> </small>
+
+このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。
+元のコードは [ここ](https://github.com/salesforce/LAVIS/tree/5ee63d688ba4cebff63acee04adaef2dee9af207) にあります。
+
+## Usage tips
+
+- BLIP-2 は、画像とオプションのテキスト プロンプトを指定して条件付きテキストを生成するために使用できます。推論時には、 [`generate`] メソッドを使用することをお勧めします。
+- [`Blip2Processor`] を使用してモデル用の画像を準備し、予測されたトークン ID をデコードしてテキストに戻すことができます。
+
+## Resources
+
+BLIP-2 の使用を開始するのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。
+
+- 画像キャプション、ビジュアル質問応答 (VQA)、およびチャットのような会話のための BLIP-2 のデモ ノートブックは、[こちら](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/BLIP-2) にあります。
+
+ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+## Blip2Config
+
+[[autodoc]] Blip2Config
+    - from_vision_qformer_text_configs
+
+## Blip2VisionConfig
+
+[[autodoc]] Blip2VisionConfig
+
+## Blip2QFormerConfig
+
+[[autodoc]] Blip2QFormerConfig
+
+## Blip2Processor
+
+[[autodoc]] Blip2Processor
+
+## Blip2VisionModel
+
+[[autodoc]] Blip2VisionModel
+    - forward
+
+## Blip2QFormerModel
+
+[[autodoc]] Blip2QFormerModel
+    - forward
+
+## Blip2Model
+
+[[autodoc]] Blip2Model
+    - forward
+    - get_text_features
+    - get_image_features
+    - get_qformer_features
+
+## Blip2ForConditionalGeneration
+
+[[autodoc]] Blip2ForConditionalGeneration
+    - forward
+    - generate
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/blip.md b/docs/source/ja/model_doc/blip.md
new file mode 100644
index 00000000000000..c145af701f23bb
--- /dev/null
+++ b/docs/source/ja/model_doc/blip.md
@@ -0,0 +1,134 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BLIP
+
+## Overview
+
+BLIP モデルは、[BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) で Junnan Li、Dongxu Li、Caiming Xiong、Steven Hoi によって提案されました。 。
+
+BLIP は、次のようなさまざまなマルチモーダル タスクを実行できるモデルです。
+- 視覚的な質問応答
+- 画像とテキストの検索（画像とテキストのマッチング）
+- 画像キャプション
+
+論文の要約は次のとおりです。
+
+*視覚言語事前トレーニング (VLP) により、多くの視覚言語タスクのパフォーマンスが向上しました。
+ただし、既存の事前トレーニング済みモデルのほとんどは、理解ベースのタスクまたは世代ベースのタスクのいずれかでのみ優れています。さらに、最適ではない監視ソースである Web から収集されたノイズの多い画像とテキストのペアを使用してデータセットをスケールアップすることで、パフォーマンスの向上が大幅に達成されました。この論文では、視覚言語の理解と生成タスクの両方に柔軟に移行する新しい VLP フレームワークである BLIP を提案します。 BLIP は、キャプションをブートストラップすることでノイズの多い Web データを効果的に利用します。キャプショナーが合成キャプションを生成し、フィルターがノイズの多いキャプションを除去します。画像テキスト検索 (平均再現率 +2.7%@1)、画像キャプション作成 (CIDEr で +2.8%)、VQA ( VQA スコアは +1.6%)。 BLIP は、ゼロショット方式でビデオ言語タスクに直接転送した場合にも、強力な一般化能力を発揮します。コード、モデル、データセットがリリースされています。*
+
+![BLIP.gif](https://cdn-uploads.huggingface.co/production/uploads/1670928184033-62441d1d9fdefb55a0b7d12c.gif)
+
+このモデルは [ybelkada](https://huggingface.co/ybelkada) によって提供されました。
+元のコードは [ここ](https://github.com/salesforce/BLIP) にあります。
+
+## Resources
+
+- [Jupyter ノートブック](https://github.com/huggingface/notebooks/blob/main/examples/image_captioning_blip.ipynb) カスタム データセットの画像キャプション用に BLIP を微調整する方法
+
+## BlipConfig
+
+[[autodoc]] BlipConfig
+    - from_text_vision_configs
+
+## BlipTextConfig
+
+[[autodoc]] BlipTextConfig
+
+## BlipVisionConfig
+
+[[autodoc]] BlipVisionConfig
+
+## BlipProcessor
+
+[[autodoc]] BlipProcessor
+
+## BlipImageProcessor
+
+[[autodoc]] BlipImageProcessor
+    - preprocess
+
+<frameworkcontent>
+<pt>
+
+## BlipModel
+
+[[autodoc]] BlipModel
+    - forward
+    - get_text_features
+    - get_image_features
+
+## BlipTextModel
+
+[[autodoc]] BlipTextModel
+    - forward
+
+## BlipVisionModel
+
+[[autodoc]] BlipVisionModel
+    - forward
+
+## BlipForConditionalGeneration
+
+[[autodoc]] BlipForConditionalGeneration
+    - forward
+
+## BlipForImageTextRetrieval
+
+[[autodoc]] BlipForImageTextRetrieval
+    - forward
+
+## BlipForQuestionAnswering
+
+[[autodoc]] BlipForQuestionAnswering
+    - forward
+
+</pt>
+<tf>
+
+## TFBlipModel
+
+[[autodoc]] TFBlipModel
+    - call
+    - get_text_features
+    - get_image_features
+
+## TFBlipTextModel
+
+[[autodoc]] TFBlipTextModel
+    - call
+
+## TFBlipVisionModel
+
+[[autodoc]] TFBlipVisionModel
+    - call
+
+## TFBlipForConditionalGeneration
+
+[[autodoc]] TFBlipForConditionalGeneration
+    - call
+
+## TFBlipForImageTextRetrieval
+
+[[autodoc]] TFBlipForImageTextRetrieval
+    - call
+
+## TFBlipForQuestionAnswering
+
+[[autodoc]] TFBlipForQuestionAnswering
+    - call
+</tf>
+</frameworkcontent>
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/bloom.md b/docs/source/ja/model_doc/bloom.md
new file mode 100644
index 00000000000000..1ac9396fa8ab76
--- /dev/null
+++ b/docs/source/ja/model_doc/bloom.md
@@ -0,0 +1,107 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BLOOM
+
+## Overview
+
+BLOOM モデルは、[BigScience Workshop](https://bigscience.huggingface.co/) を通じてさまざまなバージョンで提案されています。 BigScience は、研究者が時間とリソースをプールして共同でより高い効果を達成する他のオープン サイエンス イニシアチブからインスピレーションを得ています。
+BLOOM のアーキテクチャは基本的に GPT3 (次のトークン予測のための自己回帰モデル) に似ていますが、46 の異なる言語と 13 のプログラミング言語でトレーニングされています。
+モデルのいくつかの小さいバージョンが同じデータセットでトレーニングされています。 BLOOM は次のバージョンで利用できます。
+
+- [bloom-560m](https://huggingface.co/bigscience/bloom-560m)
+- [bloom-1b1](https://huggingface.co/bigscience/bloom-1b1)
+- [bloom-1b7](https://huggingface.co/bigscience/bloom-1b7)
+- [bloom-3b](https://huggingface.co/bigscience/bloom-3b)
+- [bloom-7b1](https://huggingface.co/bigscience/bloom-7b1)
+- [bloom](https://huggingface.co/bigscience/bloom) (176B parameters)
+
+## Resources
+
+BLOOM を使い始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+<PipelineTag pipeline="text-generation"/>
+
+- [`BloomForCausalLM`] これによってサポートされています [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
+
+以下も参照してください。
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [トークン分類タスクガイド](../tasks/token_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+
+
+⚡️ 推論
+-  に関するブログ  [最適化の話: ブルーム推論](https://huggingface.co/blog/bloom-inference-optimization)。
+- に関するブログ [DeepSpeed と Accelerate を使用した信じられないほど高速な BLOOM 推論](https://huggingface.co/blog/bloom-inference-pytorch-scripts)。
+
+⚙️トレーニング
+- に関するブログ [BLOOM トレーニングの背後にあるテクノロジー](https://huggingface.co/blog/bloom-megatron-deepspeed)。
+
+## BloomConfig
+
+[[autodoc]] BloomConfig
+    - all
+
+## BloomTokenizerFast
+
+[[autodoc]] BloomTokenizerFast
+    - all
+
+
+<frameworkcontent>
+<pt>
+
+## BloomModel
+
+[[autodoc]] BloomModel
+    - forward
+
+## BloomForCausalLM
+
+[[autodoc]] BloomForCausalLM
+    - forward
+
+## BloomForSequenceClassification
+
+[[autodoc]] BloomForSequenceClassification
+    - forward
+
+## BloomForTokenClassification
+
+[[autodoc]] BloomForTokenClassification
+    - forward
+
+## BloomForQuestionAnswering
+
+[[autodoc]] BloomForQuestionAnswering
+    - forward
+
+</pt>
+<jax>
+
+## FlaxBloomModel
+
+[[autodoc]] FlaxBloomModel
+    - __call__
+
+## FlaxBloomForCausalLM
+
+[[autodoc]] FlaxBloomForCausalLM
+    - __call__
+
+</jax>
+</frameworkcontent>
diff --git a/docs/source/ja/model_doc/bort.md b/docs/source/ja/model_doc/bort.md
new file mode 100644
index 00000000000000..2b892a35bb9cc3
--- /dev/null
+++ b/docs/source/ja/model_doc/bort.md
@@ -0,0 +1,55 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BORT
+
+<Tip warning={true}>
+
+このモデルはメンテナンス モードのみであり、コードを変更する新しい PR は受け付けられません。
+
+このモデルの実行中に問題が発生した場合は、このモデルをサポートしていた最後のバージョン (v4.30.0) を再インストールしてください。
+これを行うには、コマンド `pip install -U Transformers==4.30.0` を実行します。
+
+</Tip>
+
+## Overview
+
+BORT モデルは、[Optimal Subarchitecture Extraction for BERT](https://arxiv.org/abs/2010.10499) で提案されました。
+Adrian de Wynter and Daniel J. Perry.これは、BERT のアーキテクチャ パラメータの最適なサブセットです。
+著者は「ボルト」と呼んでいます。
+
+論文の要約は次のとおりです。
+
+*Devlin らから BERT アーキテクチャのアーキテクチャ パラメータの最適なサブセットを抽出します。 (2018)
+ニューラル アーキテクチャ検索のアルゴリズムにおける最近の画期的な技術を適用します。この最適なサブセットを次のように呼びます。
+"Bort" は明らかに小さく、有効 (つまり、埋め込み層を考慮しない) サイズは 5.5% です。
+オリジナルの BERT 大規模アーキテクチャ、およびネット サイズの 16%。 Bort は 288 GPU 時間で事前トレーニングすることもできます。
+最高パフォーマンスの BERT パラメトリック アーキテクチャ バリアントである RoBERTa-large の事前トレーニングに必要な時間の 1.2%
+(Liu et al., 2019)、同じマシンで BERT-large をトレーニングするのに必要な GPU 時間の世界記録の約 33%
+ハードウェア。また、CPU 上で 7.9 倍高速であるだけでなく、他の圧縮バージョンよりもパフォーマンスが優れています。
+アーキテクチャ、および一部の非圧縮バリアント: 0.3% ～ 31% のパフォーマンス向上が得られます。
+BERT-large に関して、複数の公開自然言語理解 (NLU) ベンチマークにおける絶対的な評価。*
+
+このモデルは [stefan-it](https://huggingface.co/stefan-it) によって提供されました。元のコードは[ここ](https://github.com/alexa/bort/)にあります。
+
+## Usage tips
+
+- BORT のモデル アーキテクチャは BERT に基づいています。詳細については、[BERT のドキュメント ページ](bert) を参照してください。
+  モデルの API リファレンスと使用例。
+- BORT は BERT トークナイザーの代わりに RoBERTa トークナイザーを使用します。トークナイザーの API リファレンスと使用例については、[RoBERTa のドキュメント ページ](roberta) を参照してください。
+- BORT には、 [Agora](https://adewynter.github.io/notes/bort_algorithms_and_applications.html#fine-tuning-with-algebraic-topology) と呼ばれる特定の微調整アルゴリズムが必要です。
+  残念ながらまだオープンソース化されていません。誰かが実装しようとすると、コミュニティにとって非常に役立ちます。
+  BORT の微調整を機能させるためのアルゴリズム。
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/bridgetower.md b/docs/source/ja/model_doc/bridgetower.md
new file mode 100644
index 00000000000000..12be1fcc26c6ca
--- /dev/null
+++ b/docs/source/ja/model_doc/bridgetower.md
@@ -0,0 +1,171 @@
+<!--Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# BridgeTower
+
+## Overview
+
+BridgeTower モデルは、Xiao Xu、Chenfei Wu、Shachar Rosenman、Vasudev Lal、Wanxiang Che、Nan Duan [BridgeTower: Building Bridges Between Encoders in Vision-Language Representative Learning](https://arxiv.org/abs/2206.08657) で提案されました。ドゥアン。このモデルの目標は、
+各ユニモーダル エンコーダとクロスモーダル エンコーダの間のブリッジにより、クロスモーダル エンコーダの各層での包括的かつ詳細な対話が可能になり、追加のパフォーマンスと計算コストがほとんど無視できる程度で、さまざまな下流タスクで優れたパフォーマンスを実現します。
+
+この論文は [AAAI'23](https://aaai.org/Conferences/AAAI-23/) 会議に採択されました。
+
+論文の要約は次のとおりです。
+
+*TWO-TOWER アーキテクチャを備えたビジョン言語 (VL) モデルは、近年の視覚言語表現学習の主流となっています。
+現在の VL モデルは、軽量のユニモーダル エンコーダーを使用して、ディープ クロスモーダル エンコーダーで両方のモダリティを同時に抽出、位置合わせ、融合することを学習するか、事前にトレーニングされたディープ ユニモーダル エンコーダーから最終層のユニモーダル表現を上部のクロスモーダルエンコーダー。
+どちらのアプローチも、視覚言語表現の学習を制限し、モデルのパフォーマンスを制限する可能性があります。この論文では、ユニモーダル エンコーダの最上位層とクロスモーダル エンコーダの各層の間の接続を構築する複数のブリッジ層を導入する BRIDGETOWER を提案します。
+これにより、効果的なボトムアップのクロスモーダル調整と、クロスモーダル エンコーダー内の事前トレーニング済みユニモーダル エンコーダーのさまざまなセマンティック レベルの視覚表現とテキスト表現の間の融合が可能になります。 BRIDGETOWER は 4M 画像のみで事前トレーニングされており、さまざまな下流の視覚言語タスクで最先端のパフォーマンスを実現します。
+特に、VQAv2 テスト標準セットでは、BRIDGETOWER は 78.73% の精度を達成し、同じ事前トレーニング データとほぼ無視できる追加パラメータと計算コストで以前の最先端モデル METER を 1.09% 上回りました。
+特に、モデルをさらにスケーリングすると、BRIDGETOWER は 81.15% の精度を達成し、桁違いに大きなデータセットで事前トレーニングされたモデルを上回りました。*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/bridgetower_architecture%20.jpg"
+alt="drawing" width="600"/>
+
+<small> ブリッジタワー アーキテクチャ。 <a href="https://arxiv.org/abs/2206.08657">元の論文から抜粋。</a> </small>
+
+このモデルは、[Anahita Bhiwandiwalla](https://huggingface.co/anahita-b)、[Tiep Le](https://huggingface.co/Tile)、[Shaoyen Tseng](https://huggingface.co/shaoyent）。元のコードは [ここ](https://github.com/microsoft/BridgeTower) にあります。
+
+## Usage tips and examples
+
+BridgeTower は、ビジュアル エンコーダー、テキスト エンコーダー、および複数の軽量ブリッジ レイヤーを備えたクロスモーダル エンコーダーで構成されます。
+このアプローチの目標は、各ユニモーダル エンコーダーとクロスモーダル エンコーダーの間にブリッジを構築し、クロスモーダル エンコーダーの各層で包括的かつ詳細な対話を可能にすることでした。
+原則として、提案されたアーキテクチャでは、任意のビジュアル、テキスト、またはクロスモーダル エンコーダを適用できます。
+
+[`BridgeTowerProcessor`] は、[`RobertaTokenizer`] と [`BridgeTowerImageProcessor`] を単一のインスタンスにラップし、両方の機能を実現します。
+テキストをエンコードし、画像をそれぞれ用意します。
+
+次の例は、[`BridgeTowerProcessor`] と [`BridgeTowerForContrastiveLearning`] を使用して対照学習を実行する方法を示しています。
+
+```python
+>>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
+>>> import requests
+>>> from PIL import Image
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]
+
+>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
+>>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
+
+>>> # forward pass
+>>> scores = dict()
+>>> for text in texts:
+...     # prepare inputs
+...     encoding = processor(image, text, return_tensors="pt")
+...     outputs = model(**encoding)
+...     scores[text] = outputs
+```
+
+次の例は、[`BridgeTowerProcessor`] と [`BridgeTowerForImageAndTextRetrieval`] を使用して画像テキストの取得を実行する方法を示しています。
+
+```python
+>>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
+>>> import requests
+>>> from PIL import Image
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+>>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]
+
+>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
+>>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
+
+>>> # forward pass
+>>> scores = dict()
+>>> for text in texts:
+...     # prepare inputs
+...     encoding = processor(image, text, return_tensors="pt")
+...     outputs = model(**encoding)
+...     scores[text] = outputs.logits[0, 1].item()
+```
+
+次の例は、[`BridgeTowerProcessor`] と [`BridgeTowerForMaskedLM`] を使用してマスクされた言語モデリングを実行する方法を示しています。
+
+```python
+>>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
+>>> from PIL import Image
+>>> import requests
+
+>>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+>>> text = "a <mask> looking out of the window"
+
+>>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
+>>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
+
+>>> # prepare inputs
+>>> encoding = processor(image, text, return_tensors="pt")
+
+>>> # forward pass
+>>> outputs = model(**encoding)
+
+>>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())
+
+>>> print(results)
+.a cat looking out of the window.
+```
+
+チップ：
+
+- BridgeTower のこの実装では、[`RobertaTokenizer`] を使用してテキスト埋め込みを生成し、OpenAI の CLIP/ViT モデルを使用して視覚的埋め込みを計算します。
+- 事前トレーニングされた [bridgeTower-base](https://huggingface.co/BridgeTower/bridgetower-base) および [bridgetower マスクされた言語モデリングと画像テキスト マッチング](https://huggingface.co/BridgeTower/bridgetower--base-itm-mlm) のチェックポイント がリリースされました。
+- 画像検索およびその他の下流タスクにおける BridgeTower のパフォーマンスについては、[表 5](https://arxiv.org/pdf/2206.08657.pdf) を参照してください。
+- このモデルの PyTorch バージョンは、torch 1.10 以降でのみ使用できます。
+
+## BridgeTowerConfig
+
+[[autodoc]] BridgeTowerConfig
+
+## BridgeTowerTextConfig
+
+[[autodoc]] BridgeTowerTextConfig
+
+## BridgeTowerVisionConfig
+
+[[autodoc]] BridgeTowerVisionConfig
+
+## BridgeTowerImageProcessor
+
+[[autodoc]] BridgeTowerImageProcessor
+    - preprocess
+
+## BridgeTowerProcessor
+
+[[autodoc]] BridgeTowerProcessor
+    - __call__
+
+## BridgeTowerModel
+
+[[autodoc]] BridgeTowerModel
+    - forward
+
+## BridgeTowerForContrastiveLearning
+
+[[autodoc]] BridgeTowerForContrastiveLearning
+    - forward
+
+## BridgeTowerForMaskedLM
+
+[[autodoc]] BridgeTowerForMaskedLM
+    - forward
+
+## BridgeTowerForImageAndTextRetrieval
+
+[[autodoc]] BridgeTowerForImageAndTextRetrieval
+    - forward
+
diff --git a/docs/source/ja/model_doc/bros.md b/docs/source/ja/model_doc/bros.md
new file mode 100644
index 00000000000000..3749a172a8286b
--- /dev/null
+++ b/docs/source/ja/model_doc/bros.md
@@ -0,0 +1,113 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# BROS
+
+## Overview
+
+BROS モデルは、Teakgyu Hon、Donghyun Kim、Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park によって [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) で提案されました。 
+
+BROS は *BERT Relying On Spatality* の略です。これは、一連のトークンとその境界ボックスを入力として受け取り、一連の隠れ状態を出力するエンコーダー専用の Transformer モデルです。 BROS は、絶対的な空間情報を使用する代わりに、相対的な空間情報をエンコードします。
+
+BERT で使用されるトークンマスク言語モデリング目標 (TMLM) と新しいエリアマスク言語モデリング目標 (AMLM) の 2 つの目標で事前トレーニングされています。
+TMLM では、トークンはランダムにマスクされ、モデルは空間情報と他のマスクされていないトークンを使用してマスクされたトークンを予測します。
+AMLM は TMLM の 2D バージョンです。テキスト トークンをランダムにマスクし、TMLM と同じ情報で予測しますが、テキスト ブロック (領域) をマスクします。
+
+`BrosForTokenClassification`には、BrosModel の上に単純な線形層があります。各トークンのラベルを予測します。
+`BrosSpadeEEForTokenClassification`には、BrosModel の上に`initial_token_classifier`と`subsequent_token_classifier`があります。 `initial_token_classifier` は各エンティティの最初のトークンを予測するために使用され、`subsequent_token_classifier` はエンティティ内の次のトークンを予測するために使用されます。 `BrosSpadeELForTokenClassification`には BrosModel の上に`entity_linker`があります。 `entity_linker` は 2 つのエンティティ間の関係を予測するために使用されます。
+
+`BrosForTokenClassification`と`BrosSpadeEEForTokenClassification`は基本的に同じジョブを実行します。ただし、`BrosForTokenClassification`は入力トークンが完全にシリアル化されていることを前提としています (トークンは 2D 空間に存在するため、これは非常に困難な作業です)。一方、`BrosSpadeEEForTokenClassification`は 1 つのトークンから次の接続トークンを予測するため、シリアル化エラーの処理をより柔軟に行うことができます。
+
+`BrosSpadeELForTokenClassification` はエンティティ内のリンク タスクを実行します。これら 2 つのエンティティが何らかの関係を共有する場合、(あるエンティティの) 1 つのトークンから (別のエンティティの) 別のトークンへの関係を予測します。
+
+BROS は、明示的な視覚機能に依存せずに、FUNSD、SROIE、CORD、SciTSR などの Key Information Extraction (KIE) ベンチマークで同等以上の結果を達成します。
+
+論文の要約は次のとおりです。
+
+*文書画像からの重要情報抽出 (KIE) には、2 次元 (2D) 空間におけるテキストの文脈的および空間的意味論を理解する必要があります。最近の研究の多くは、文書画像の視覚的特徴とテキストおよびそのレイアウトを組み合わせることに重点を置いた事前トレーニング済み言語モデルを開発することで、この課題を解決しようとしています。一方、このペーパーでは、テキストとレイアウトの効果的な組み合わせという基本に立ち返ってこの問題に取り組みます。具体的には、BROS (BERT Relying On Spatality) という名前の事前トレーニング済み言語モデルを提案します。この言語モデルは、2D 空間内のテキストの相対位置をエンコードし、エリア マスキング戦略を使用してラベルのないドキュメントから学習します。 2D 空間内のテキストを理解するためのこの最適化されたトレーニング スキームにより、BROS は、視覚的な特徴に依存することなく、4 つの KIE ベンチマーク (FUNSD、SROIE*、CORD、および SciTSR) で以前の方法と比較して同等以上のパフォーマンスを示しました。また、この論文では、KIE タスクにおける 2 つの現実世界の課題 ((1) 間違ったテキスト順序によるエラーの最小化、および (2) 少数の下流例からの効率的な学習) を明らかにし、以前の方法に対する BROS の優位性を実証します。*
+
+このモデルは [jinho8345](https://huggingface.co/jinho8345) によって寄稿されました。元のコードは [ここ](https://github.com/clovaai/bros) にあります。
+
+## Usage tips and examples
+
+- [`~transformers.BrosModel.forward`] には、`input_ids` と `bbox` (バウンディング ボックス) が必要です。各境界ボックスは、(x0、y0、x1、y1) 形式 (左上隅、右下隅) である必要があります。境界ボックスの取得は外部 OCR システムに依存します。 「x」座標はドキュメント画像の幅で正規化する必要があり、「y」座標はドキュメント画像の高さで正規化する必要があります。
+
+```python
+def expand_and_normalize_bbox(bboxes, doc_width, doc_height):
+    # here, bboxes are numpy array
+
+    # Normalize bbox -> 0 ~ 1
+    bboxes[:, [0, 2]] = bboxes[:, [0, 2]] / width
+    bboxes[:, [1, 3]] = bboxes[:, [1, 3]] / height
+```
+
+- [`~transformers.BrosForTokenClassification.forward`、`~transformers.BrosSpadeEEForTokenClassification.forward`、`~transformers.BrosSpadeEEForTokenClassification.forward`] では、損失計算に `input_ids` と `bbox` だけでなく `box_first_token_mask` も必要です。これは、各ボックスの先頭以外のトークンを除外するためのマスクです。このマスクは、単語から `input_ids` を作成するときに境界ボックスの開始トークン インデックスを保存することで取得できます。次のコードで`box_first_token_mask`を作成できます。
+
+```python
+def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512):
+
+    box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_)
+
+    # encode(tokenize) each word from words (List[str])
+    input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words]
+
+    # get the length of each box
+    tokens_length_list: List[int] = [len(l) for l in input_ids_list]
+
+    box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list)))
+    box_start_token_indices = box_end_token_indices - np.array(tokens_length_list)
+
+    # filter out the indices that are out of max_seq_length
+    box_end_token_indices = box_end_token_indices[box_end_token_indices < max_seq_length - 1]
+    if len(box_start_token_indices) > len(box_end_token_indices):
+        box_start_token_indices = box_start_token_indices[: len(box_end_token_indices)]
+
+    # set box_start_token_indices to True
+    box_first_token_mask[box_start_token_indices] = True
+
+    return box_first_token_mask
+
+```
+
+## Resources
+
+- デモ スクリプトは [こちら](https://github.com/clovaai/bros) にあります。
+
+## BrosConfig
+
+[[autodoc]] BrosConfig
+
+## BrosProcessor
+
+[[autodoc]] BrosProcessor
+    - __call__
+
+## BrosModel
+
+[[autodoc]] BrosModel
+    - forward
+
+
+## BrosForTokenClassification
+
+[[autodoc]] BrosForTokenClassification
+    - forward
+
+## BrosSpadeEEForTokenClassification
+
+[[autodoc]] BrosSpadeEEForTokenClassification
+    - forward
+
+## BrosSpadeELForTokenClassification
+
+[[autodoc]] BrosSpadeELForTokenClassification
+    - forward
diff --git a/docs/source/ja/model_doc/byt5.md b/docs/source/ja/model_doc/byt5.md
new file mode 100644
index 00000000000000..c6796f981817dc
--- /dev/null
+++ b/docs/source/ja/model_doc/byt5.md
@@ -0,0 +1,154 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# ByT5
+
+## Overview
+
+ByT5 モデルは、[ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir
+Kale, Adam Roberts, Colin Raffel.
+
+論文の要約は次のとおりです。
+
+*最も広く使用されている事前トレーニング済み言語モデルは、単語またはサブワード単位に対応するトークンのシーケンスで動作します。
+テキストをトークンのシーケンスとしてエンコードするには、トークナイザーが必要です。トークナイザーは通常、
+モデル。代わりに生のテキスト (バイトまたは文字) を直接操作するトークンフリー モデルには多くの利点があります。
+すぐに使用できるあらゆる言語のテキストを処理でき、ノイズに対してより堅牢であり、技術的負債を最小限に抑えます。
+複雑でエラーが発生しやすいテキスト前処理パイプラインを削除します。バイトまたは文字列がトークンより長いため
+トークンフリー モデルに関する過去の研究では、シーケンスのコストを償却するように設計された新しいモデル アーキテクチャが導入されることがよくありました。
+生のテキストを直接操作します。この論文では、標準的な Transformer アーキテクチャが次のようなもので使用できることを示します。
+バイトシーケンスを処理するための最小限の変更。パラメータ数の観点からトレードオフを注意深く特徴付けます。
+FLOP のトレーニングと推論速度を調べ、バイトレベルのモデルがトークンレベルと競合できることを示します。
+対応者。また、バイトレベルのモデルはノイズに対して大幅に堅牢であり、より優れたパフォーマンスを発揮することも示しています。
+スペルと発音に敏感なタスク。私たちの貢献の一環として、新しいセットをリリースします。
+T5 アーキテクチャに基づいた事前トレーニング済みのバイトレベルの Transformer モデルと、そこで使用されるすべてのコードとデータ
+実験。*
+
+このモデルは、[patrickvonplaten](https://huggingface.co/patrickvonplaten) によって提供されました。元のコードは次のとおりです
+[ここ](https://github.com/google-research/byt5) にあります。
+
+<Tip>
+
+ByT5 のアーキテクチャは T5v1.1 モデルに基づいています。API リファレンスについては、[T5v1.1 のドキュメント ページ](t5v1.1) を参照してください。彼らは
+モデルの入力を準備する方法が異なるだけです。以下のコード例を参照してください。
+
+</Tip>
+
+ByT5 は教師なしで事前トレーニングされているため、単一タスク中にタスク プレフィックスを使用する利点はありません。
+微調整。マルチタスクの微調整を行う場合は、プレフィックスを使用する必要があります。
+
+## Usage Examples
+
+ByT5 は生の UTF-8 バイトで動作するため、トークナイザーなしで使用できます。
+
+```python
+>>> from transformers import T5ForConditionalGeneration
+>>> import torch
+
+>>> model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+
+>>> num_special_tokens = 3
+>>> # Model has 3 special tokens which take up the input ids 0,1,2 of ByT5.
+>>> # => Need to shift utf-8 character encodings by 3 before passing ids to model.
+
+>>> input_ids = torch.tensor([list("Life is like a box of chocolates.".encode("utf-8"))]) + num_special_tokens
+
+>>> labels = torch.tensor([list("La vie est comme une boîte de chocolat.".encode("utf-8"))]) + num_special_tokens
+
+>>> loss = model(input_ids, labels=labels).loss
+>>> loss.item()
+2.66
+```
+
+ただし、バッチ推論とトレーニングの場合は、トークナイザーを使用することをお勧めします。
+
+
+```python
+>>> from transformers import T5ForConditionalGeneration, AutoTokenizer
+
+>>> model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
+>>> tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
+
+>>> model_inputs = tokenizer(
+...     ["Life is like a box of chocolates.", "Today is Monday."], padding="longest", return_tensors="pt"
+... )
+>>> labels_dict = tokenizer(
+...     ["La vie est comme une boîte de chocolat.", "Aujourd'hui c'est lundi."], padding="longest", return_tensors="pt"
+... )
+>>> labels = labels_dict.input_ids
+
+>>> loss = model(**model_inputs, labels=labels).loss
+>>> loss.item()
+17.9
+```
+
+[T5](t5) と同様に、ByT5 はスパンマスクノイズ除去タスクでトレーニングされました。しかし、
+モデルはキャラクターに直接作用するため、事前トレーニングタスクは少し複雑です
+違う。のいくつかの文字を破損してみましょう
+`"The dog chases a ball in the park."`という文を入力し、ByT5 に予測してもらいます。
+わたしたちのため。
+
+```python
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+>>> import torch
+
+>>> tokenizer = AutoTokenizer.from_pretrained("google/byt5-base")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-base")
+
+>>> input_ids_prompt = "The dog chases a ball in the park."
+>>> input_ids = tokenizer(input_ids_prompt).input_ids
+
+>>> # Note that we cannot add "{extra_id_...}" to the string directly
+>>> # as the Byte tokenizer would incorrectly merge the tokens
+>>> # For ByT5, we need to work directly on the character level
+>>> # Contrary to T5, ByT5 does not use sentinel tokens for masking, but instead
+>>> # uses final utf character ids.
+>>> # UTF-8 is represented by 8 bits and ByT5 has 3 special tokens.
+>>> # => There are 2**8+2 = 259 input ids and mask tokens count down from index 258.
+>>> # => mask to "The dog [258]a ball [257]park."
+
+>>> input_ids = torch.tensor([input_ids[:8] + [258] + input_ids[14:21] + [257] + input_ids[28:]])
+>>> input_ids
+tensor([[ 87, 107, 104,  35, 103, 114, 106,  35, 258,  35, 100,  35, 101, 100, 111, 111, 257,  35, 115, 100, 117, 110,  49,   1]])
+
+>>> # ByT5 produces only one char at a time so we need to produce many more output characters here -> set `max_length=100`.
+>>> output_ids = model.generate(input_ids, max_length=100)[0].tolist()
+>>> output_ids
+[0, 258, 108, 118,  35, 119, 107, 104,  35, 114, 113, 104,  35, 122, 107, 114,  35, 103, 114, 104, 118, 257,  35, 108, 113,  35, 119, 107, 104,  35, 103, 108, 118, 102, 114, 256, 108, 113,  35, 119, 107, 104, 35, 115, 100, 117, 110,  49,  35,  87, 107, 104,  35, 103, 114, 106, 35, 108, 118,  35, 119, 107, 104,  35, 114, 113, 104,  35, 122, 107, 114,  35, 103, 114, 104, 118,  35, 100,  35, 101, 100, 111, 111,  35, 108, 113, 255,  35, 108, 113,  35, 119, 107, 104,  35, 115, 100, 117, 110,  49]
+
+>>> # ^- Note how 258 descends to 257, 256, 255
+
+>>> # Now we need to split on the sentinel tokens, let's write a short loop for this
+>>> output_ids_list = []
+>>> start_token = 0
+>>> sentinel_token = 258
+>>> while sentinel_token in output_ids:
+...     split_idx = output_ids.index(sentinel_token)
+...     output_ids_list.append(output_ids[start_token:split_idx])
+...     start_token = split_idx
+...     sentinel_token -= 1
+
+>>> output_ids_list.append(output_ids[start_token:])
+>>> output_string = tokenizer.batch_decode(output_ids_list)
+>>> output_string
+['<pad>', 'is the one who does', ' in the disco', 'in the park. The dog is the one who does a ball in', ' in the park.']
+```
+
+## ByT5Tokenizer
+
+[[autodoc]] ByT5Tokenizer
+
+詳細については、[`ByT5Tokenizer`] を参照してください。
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/camembert.md b/docs/source/ja/model_doc/camembert.md
new file mode 100644
index 00000000000000..db8e0aa936936d
--- /dev/null
+++ b/docs/source/ja/model_doc/camembert.md
@@ -0,0 +1,135 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CamemBERT
+
+## Overview
+
+CamemBERT モデルは、[CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) で提案されました。
+Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
+Clergerie, Djamé Seddah, and Benoît Sagot. 2019年にリリースされたFacebookのRoBERTaモデルをベースにしたモデルです。
+138GBのフランス語テキストでトレーニングされました。
+
+論文の要約は次のとおりです。
+
+*事前トレーニングされた言語モデルは現在、自然言語処理で広く普及しています。成功にもかかわらず、利用可能なほとんどの
+モデルは英語のデータ、または複数言語のデータの連結でトレーニングされています。これにより、
+このようなモデルの実際の使用は、英語を除くすべての言語で非常に限られています。フランス人にとってこの問題に対処することを目指して、
+Bi-direction Encoders for Transformers (BERT) のフランス語版である CamemBERT をリリースします。測定します
+複数の下流タスク、つまり品詞タグ付けにおける多言語モデルと比較した CamemBERT のパフォーマンス
+依存関係解析、固有表現認識、自然言語推論。 CamemBERT は最先端技術を向上させます
+検討されているほとんどのタスクに対応します。私たちは、研究と
+フランス語 NLP の下流アプリケーション。*
+
+このモデルは [camembert](https://huggingface.co/camembert) によって提供されました。元のコードは [ここ](https://camembert-model.fr/) にあります。
+
+
+<Tip>
+
+この実装はRoBERTaと同じです。使用例については[RoBERTaのドキュメント](roberta)も参照してください。
+入力と出力に関する情報として。
+
+</Tip>
+
+## Resources
+
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [トークン分類タスクガイド](../tasks/token_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+- [マスク言語モデリング タスク ガイド](../tasks/masked_language_modeling)
+- [多肢選択タスク ガイド](../tasks/multiple_choice)
+
+## CamembertConfig
+
+[[autodoc]] CamembertConfig
+
+## CamembertTokenizer
+
+[[autodoc]] CamembertTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## CamembertTokenizerFast
+
+[[autodoc]] CamembertTokenizerFast
+
+<frameworkcontent>
+<pt>
+
+## CamembertModel
+
+[[autodoc]] CamembertModel
+
+## CamembertForCausalLM
+
+[[autodoc]] CamembertForCausalLM
+
+## CamembertForMaskedLM
+
+[[autodoc]] CamembertForMaskedLM
+
+## CamembertForSequenceClassification
+
+[[autodoc]] CamembertForSequenceClassification
+
+## CamembertForMultipleChoice
+
+[[autodoc]] CamembertForMultipleChoice
+
+## CamembertForTokenClassification
+
+[[autodoc]] CamembertForTokenClassification
+
+## CamembertForQuestionAnswering
+
+[[autodoc]] CamembertForQuestionAnswering
+
+</pt>
+<tf>
+
+## TFCamembertModel
+
+[[autodoc]] TFCamembertModel
+
+## TFCamembertForCasualLM
+
+[[autodoc]] TFCamembertForCausalLM
+
+## TFCamembertForMaskedLM
+
+[[autodoc]] TFCamembertForMaskedLM
+
+## TFCamembertForSequenceClassification
+
+[[autodoc]] TFCamembertForSequenceClassification
+
+## TFCamembertForMultipleChoice
+
+[[autodoc]] TFCamembertForMultipleChoice
+
+## TFCamembertForTokenClassification
+
+[[autodoc]] TFCamembertForTokenClassification
+
+## TFCamembertForQuestionAnswering
+
+[[autodoc]] TFCamembertForQuestionAnswering
+
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/model_doc/canine.md b/docs/source/ja/model_doc/canine.md
new file mode 100644
index 00000000000000..18af699967d1ad
--- /dev/null
+++ b/docs/source/ja/model_doc/canine.md
@@ -0,0 +1,144 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CANINE
+
+## Overview
+
+CANINE モデルは、[CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language
+Representation](https://arxiv.org/abs/2103.06874)、Jonathan H. Clark、Dan Garrette、Iulia Turc、John Wieting 著。その
+明示的なトークン化ステップ (バイト ペアなど) を使用せずに Transformer をトレーニングする最初の論文の 1 つ
+エンコーディング (BPE、WordPiece または SentencePiece)。代わりに、モデルは Unicode 文字レベルで直接トレーニングされます。
+キャラクターレベルでのトレーニングでは必然的にシーケンスの長さが長くなりますが、CANINE はこれを効率的な方法で解決します。
+ディープ Transformer エンコーダを適用する前に、ダウンサンプリング戦略を実行します。
+
+論文の要約は次のとおりです。
+
+*パイプライン NLP システムは、エンドツーエンドのニューラル モデリングに大部分が取って代わられていますが、一般的に使用されているほぼすべてのモデルは
+依然として明示的なトークン化手順が必要です。最近のトークン化アプローチはデータ由来のサブワードに基づいていますが、
+レキシコンは手動で作成されたトークナイザーよりも脆弱ではありませんが、これらの技術はすべての言語に等しく適しているわけではありません。
+言語や固定語彙の使用により、モデルの適応能力が制限される可能性があります。この論文では、CANINE を紹介します。
+明示的なトークン化や語彙を使用せずに、文字シーケンスを直接操作するニューラル エンコーダーと、
+文字に直接作用するか、オプションでサブワードをソフト誘導バイアスとして使用する事前トレーニング戦略。
+よりきめの細かい入力を効果的かつ効率的に使用するために、CANINE はダウンサンプリングを組み合わせて、入力を削減します。
+コンテキストをエンコードするディープトランスフォーマースタックを備えたシーケンスの長さ。 CANINE は、同等の mBERT モデルよりも次の点で優れています。
+TyDi QA の 2.8 F1 は、モデル パラメータが 28% 少ないにもかかわらず、困難な多言語ベンチマークです。*
+
+このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。元のコードは [ここ](https://github.com/google-research/language/tree/master/language/canine) にあります。
+
+## Usage tips
+
+- CANINE は内部で少なくとも 3 つの Transformer エンコーダーを使用します: 2 つの「浅い」エンコーダー (単一のエンコーダーのみで構成)
+  レイヤー) と 1 つの「ディープ」エンコーダー (通常の BERT エンコーダー)。まず、「浅い」エンコーダを使用してコンテキストを設定します。
+  ローカル アテンションを使用した文字の埋め込み。次に、ダウンサンプリングの後、「ディープ」エンコーダーが適用されます。ついに、
+  アップサンプリング後、「浅い」エンコーダを使用して最終的な文字埋め込みが作成されます。アップと
+  ダウンサンプリングについては論文に記載されています。
+- CANINE は、デフォルトで 2048 文字の最大シーケンス長を使用します。 [`CanineTokenizer`] を使用できます
+  モデル用のテキストを準備します。
+- 特別な [CLS] トークンの最終的な非表示状態の上に線形レイヤーを配置することで分類を行うことができます。
+  (事前定義された Unicode コード ポイントがあります)。ただし、トークン分類タスクの場合は、ダウンサンプリングされたシーケンス
+  トークンは、元の文字シーケンスの長さ (2048) と一致するように再度アップサンプリングする必要があります。の
+  詳細については、論文を参照してください。
+
+モデルのチェックポイント:
+
+  - [google/canine-c](https://huggingface.co/google/canine-c): 自己回帰文字損失で事前トレーニング済み、
+    12 レイヤー、768 隠し、12 ヘッド、121M パラメーター (サイズ ~500 MB)。
+  - [google/canine-s](https://huggingface.co/google/canine-s): サブワード損失で事前トレーニング済み、12 層、
+    768 個の非表示、12 ヘッド、121M パラメーター (サイズ ~500 MB)。
+
+## Usage example
+
+CANINE は生の文字で動作するため、**トークナイザーなし**で使用できます。
+
+```python
+>>> from transformers import CanineModel
+>>> import torch
+
+>>> model = CanineModel.from_pretrained("google/canine-c")  # model pre-trained with autoregressive character loss
+
+>>> text = "hello world"
+>>> # use Python's built-in ord() function to turn each character into its unicode code point id
+>>> input_ids = torch.tensor([[ord(char) for char in text]])
+
+>>> outputs = model(input_ids)  # forward pass
+>>> pooled_output = outputs.pooler_output
+>>> sequence_output = outputs.last_hidden_state
+```
+
+ただし、バッチ推論とトレーニングの場合は、トークナイザーを使用することをお勧めします（すべてをパディング/切り詰めるため）
+シーケンスを同じ長さにします):
+
+```python
+>>> from transformers import CanineTokenizer, CanineModel
+
+>>> model = CanineModel.from_pretrained("google/canine-c")
+>>> tokenizer = CanineTokenizer.from_pretrained("google/canine-c")
+
+>>> inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
+>>> encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")
+
+>>> outputs = model(**encoding)  # forward pass
+>>> pooled_output = outputs.pooler_output
+>>> sequence_output = outputs.last_hidden_state
+```
+
+## Resources
+
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [トークン分類タスクガイド](../tasks/token_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+- [多肢選択タスク ガイド](../tasks/multiple_choice)
+
+## CanineConfig
+
+[[autodoc]] CanineConfig
+
+## CanineTokenizer
+
+[[autodoc]] CanineTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+
+## CANINE specific outputs
+
+[[autodoc]] models.canine.modeling_canine.CanineModelOutputWithPooling
+
+## CanineModel
+
+[[autodoc]] CanineModel
+    - forward
+
+## CanineForSequenceClassification
+
+[[autodoc]] CanineForSequenceClassification
+    - forward
+
+## CanineForMultipleChoice
+
+[[autodoc]] CanineForMultipleChoice
+    - forward
+
+## CanineForTokenClassification
+
+[[autodoc]] CanineForTokenClassification
+    - forward
+
+## CanineForQuestionAnswering
+
+[[autodoc]] CanineForQuestionAnswering
+    - forward
diff --git a/docs/source/ja/model_doc/chinese_clip.md b/docs/source/ja/model_doc/chinese_clip.md
new file mode 100644
index 00000000000000..8d7dc401d2ae7e
--- /dev/null
+++ b/docs/source/ja/model_doc/chinese_clip.md
@@ -0,0 +1,112 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Chinese-CLIP
+
+## Overview
+
+Chinese-CLIP An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) で提案されました。周、張周。
+Chinese-CLIP は、中国語の画像とテキストのペアの大規模なデータセットに対する CLIP (Radford et al., 2021) の実装です。クロスモーダル検索を実行できるほか、ゼロショット画像分類、オープンドメインオブジェクト検出などのビジョンタスクのビジョンバックボーンとしても機能します。オリジナルの中国語-CLIPコードは[このリンクで](https://github.com/OFA-Sys/Chinese-CLIP)。
+
+論文の要約は次のとおりです。
+
+*CLIP の大成功 (Radford et al., 2021) により、視覚言語の事前訓練のための対照学習の研究と応用が促進されました。この研究では、ほとんどのデータが公開されているデータセットから取得された中国語の画像とテキストのペアの大規模なデータセットを構築し、新しいデータセットで中国語の CLIP モデルを事前トレーニングします。当社では、7,700 万から 9 億 5,800 万のパラメータにわたる、複数のサイズの 5 つの中国 CLIP モデルを開発しています。さらに、モデルのパフォーマンスを向上させるために、最初に画像エンコーダーをフリーズさせてモデルをトレーニングし、次にすべてのパラメーターを最適化してトレーニングする 2 段階の事前トレーニング方法を提案します。私たちの包括的な実験では、中国の CLIP がゼロショット学習と微調整のセットアップで MUGE、Flickr30K-CN、および COCO-CN 上で最先端のパフォーマンスを達成でき、ゼロで競争力のあるパフォーマンスを達成できることを実証しています。 - ELEVATER ベンチマークでの評価に基づくショット画像の分類 (Li et al., 2022)。コード、事前トレーニング済みモデル、デモがリリースされました。*
+
+Chinese-CLIP モデルは、[OFA-Sys](https://huggingface.co/OFA-Sys) によって提供されました。
+
+## Usage example
+
+以下のコード スニペットは、画像とテキストの特徴と類似性を計算する方法を示しています。
+
+```python
+>>> from PIL import Image
+>>> import requests
+>>> from transformers import ChineseCLIPProcessor, ChineseCLIPModel
+
+>>> model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
+>>> processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16")
+
+>>> url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+>>> # Squirtle, Bulbasaur, Charmander, Pikachu in English
+>>> texts = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]
+
+>>> # compute image feature
+>>> inputs = processor(images=image, return_tensors="pt")
+>>> image_features = model.get_image_features(**inputs)
+>>> image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True)  # normalize
+
+>>> # compute text features
+>>> inputs = processor(text=texts, padding=True, return_tensors="pt")
+>>> text_features = model.get_text_features(**inputs)
+>>> text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True)  # normalize
+
+>>> # compute image-text similarity scores
+>>> inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
+>>> outputs = model(**inputs)
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1)  # probs: [[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]]
+```
+
+現在、次のスケールの事前トレーニング済み Chinese-CLIP モデルが 🤗 Hub で利用可能です。
+
+- [OFA-Sys/chinese-clip-vit-base-patch16](https://huggingface.co/OFA-Sys/chinese-clip-vit-base-patch16)
+- [OFA-Sys/chinese-clip-vit-large-patch14](https://huggingface.co/OFA-Sys/chinese-clip-vit-large-patch14)
+- [OFA-Sys/chinese-clip-vit-large-patch14-336px](https://huggingface.co/OFA-Sys/chinese-clip-vit-large-patch14-336px)
+- [OFA-Sys/chinese-clip-vit-huge-patch14](https://huggingface.co/OFA-Sys/chinese-clip-vit-huge-patch14)
+
+## ChineseCLIPConfig
+
+[[autodoc]] ChineseCLIPConfig
+    - from_text_vision_configs
+
+## ChineseCLIPTextConfig
+
+[[autodoc]] ChineseCLIPTextConfig
+
+## ChineseCLIPVisionConfig
+
+[[autodoc]] ChineseCLIPVisionConfig
+
+## ChineseCLIPImageProcessor
+
+[[autodoc]] ChineseCLIPImageProcessor
+    - preprocess
+
+## ChineseCLIPFeatureExtractor
+
+[[autodoc]] ChineseCLIPFeatureExtractor
+
+## ChineseCLIPProcessor
+
+[[autodoc]] ChineseCLIPProcessor
+
+## ChineseCLIPModel
+
+[[autodoc]] ChineseCLIPModel
+    - forward
+    - get_text_features
+    - get_image_features
+
+## ChineseCLIPTextModel
+
+[[autodoc]] ChineseCLIPTextModel
+    - forward
+
+## ChineseCLIPVisionModel
+
+[[autodoc]] ChineseCLIPVisionModel
+    - forward
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/clap.md b/docs/source/ja/model_doc/clap.md
new file mode 100644
index 00000000000000..f1e08d76018fac
--- /dev/null
+++ b/docs/source/ja/model_doc/clap.md
@@ -0,0 +1,80 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CLAP
+
+## Overview
+
+CLAP モデルは、[Large Scale Contrastive Language-Audio pretraining with
+feature fusion and keyword-to-caption augmentation](https://arxiv.org/pdf/2211.06687.pdf)、Yusong Wu、Ke Chen、Tianyu Zhang、Yuchen Hui、Taylor Berg-Kirkpatrick、Shlomo Dubnov 著。
+
+CLAP (Contrastive Language-Audio Pretraining) は、さまざまな (音声、テキスト) ペアでトレーニングされたニューラル ネットワークです。タスクに合わせて直接最適化することなく、音声が与えられた場合に最も関連性の高いテキスト スニペットを予測するように指示できます。 CLAP モデルは、SWINTransformer を使用して log-Mel スペクトログラム入力からオーディオ特徴を取得し、RoBERTa モデルを使用してテキスト特徴を取得します。次に、テキストとオーディオの両方の特徴が、同じ次元の潜在空間に投影されます。投影されたオーディオとテキストの特徴の間のドット積が、同様のスコアとして使用されます。
+
+論文の要約は次のとおりです。
+
+*対照学習は、マルチモーダル表現学習の分野で目覚ましい成功を収めています。この論文では、音声データと自然言語記述を組み合わせて音声表現を開発する、対照的な言語音声事前トレーニングのパイプラインを提案します。この目標を達成するために、私たちはまず、さまざまなデータ ソースからの 633,526 個の音声とテキストのペアの大規模なコレクションである LAION-Audio-630K をリリースします。次に、さまざまなオーディオ エンコーダとテキスト エンコーダを考慮して、対照的な言語とオーディオの事前トレーニング モデルを構築します。機能融合メカニズムとキーワードからキャプションへの拡張をモデル設計に組み込んで、モデルが可変長の音声入力を処理できるようにし、パフォーマンスを向上させます。 3 番目に、包括的な実験を実行して、テキストから音声への取得、ゼロショット音声分類、教師付き音声分類の 3 つのタスクにわたってモデルを評価します。結果は、私たちのモデルがテキストから音声への検索タスクにおいて優れたパフォーマンスを達成していることを示しています。オーディオ分類タスクでは、モデルはゼロショット設定で最先端のパフォーマンスを達成し、非ゼロショット設定でもモデルの結果に匹敵するパフォーマンスを得ることができます。 LAION-オーディオ-6*
+
+このモデルは、[Younes Belkada](https://huggingface.co/ybelkada) および [Arthur Zucker](https://huggingface.co/ArthurZ) によって提供されました。
+元のコードは [こちら](https://github.com/LAION-AI/Clap) にあります。
+
+## ClapConfig
+
+[[autodoc]] ClapConfig
+    - from_text_audio_configs
+
+## ClapTextConfig
+
+[[autodoc]] ClapTextConfig
+
+## ClapAudioConfig
+
+[[autodoc]] ClapAudioConfig
+
+## ClapFeatureExtractor
+
+[[autodoc]] ClapFeatureExtractor
+
+## ClapProcessor
+
+[[autodoc]] ClapProcessor
+
+## ClapModel
+
+[[autodoc]] ClapModel
+    - forward
+    - get_text_features
+    - get_audio_features
+
+## ClapTextModel
+
+[[autodoc]] ClapTextModel
+    - forward
+
+## ClapTextModelWithProjection
+
+[[autodoc]] ClapTextModelWithProjection
+    - forward
+
+## ClapAudioModel
+
+[[autodoc]] ClapAudioModel
+    - forward
+
+## ClapAudioModelWithProjection
+
+[[autodoc]] ClapAudioModelWithProjection
+    - forward
+    
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/clip.md b/docs/source/ja/model_doc/clip.md
new file mode 100644
index 00000000000000..741d240b0ce9c8
--- /dev/null
+++ b/docs/source/ja/model_doc/clip.md
@@ -0,0 +1,220 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CLIP
+
+## Overview
+
+CLIP モデルは、Alec Radford、Jong Wook Kim、Chris Hallacy、Aditya Ramesh、Gabriel Goh Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) で提案されました。
+サンディニ・アガルワル、ギリッシュ・サストリー、アマンダ・アスケル、パメラ・ミシュキン、ジャック・クラーク、グレッチェン・クルーガー、イリヤ・サツケヴァー。クリップ
+(Contrastive Language-Image Pre-Training) は、さまざまな (画像、テキスト) ペアでトレーニングされたニューラル ネットワークです。かもね
+直接最適化することなく、与えられた画像から最も関連性の高いテキスト スニペットを予測するように自然言語で指示されます。
+GPT-2 および 3 のゼロショット機能と同様に、タスクに対して。
+
+論文の要約は次のとおりです。
+
+*最先端のコンピューター ビジョン システムは、あらかじめ定められたオブジェクト カテゴリの固定セットを予測するようにトレーニングされています。これ
+制限された形式の監視では、指定するために追加のラベル付きデータが必要となるため、一般性と使いやすさが制限されます。
+その他の視覚的なコンセプト。画像に関する生のテキストから直接学習することは、
+より広範な監督源。どのキャプションが表示されるかを予測するという単純な事前トレーニング タスクが有効であることを示します。
+400 のデータセットで SOTA 画像表現を最初から学習するための効率的かつスケーラブルな方法はどの画像ですか
+インターネットから収集された数百万の（画像、テキスト）ペア。事前トレーニング後、自然言語を使用して参照します。
+視覚的な概念を学習し（または新しい概念を説明し）、下流のタスクへのモデルのゼロショット転送を可能にします。私たちは勉強します
+30 を超えるさまざまな既存のコンピューター ビジョン データセットでタスクをまたがってベンチマークを行うことにより、このアプローチのパフォーマンスを評価します。
+OCR、ビデオ内のアクション認識、地理的位置特定、およびさまざまな種類のきめ細かいオブジェクト分類など。の
+モデルはほとんどのタスクに簡単に移行でき、多くの場合、必要がなくても完全に監視されたベースラインと競合します。
+データセット固有のトレーニングに適しています。たとえば、ImageNet ゼロショットではオリジナルの ResNet-50 の精度と一致します。
+トレーニングに使用された 128 万のトレーニング サンプルを使用する必要はありません。コードをリリースし、事前トレーニング済み
+モデルの重みはこの https URL で確認できます。*
+
+このモデルは [valhalla](https://huggingface.co/valhalla) によって提供されました。元のコードは [ここ](https://github.com/openai/CLIP) にあります。
+
+## Usage tips and example
+
+CLIP は、マルチモーダルなビジョンおよび言語モデルです。画像とテキストの類似性やゼロショット画像に使用できます。
+分類。 CLIP は、ViT のようなトランスフォーマーを使用して視覚的特徴を取得し、因果言語モデルを使用してテキストを取得します
+特徴。次に、テキストと視覚の両方の特徴が、同じ次元の潜在空間に投影されます。ドット
+投影された画像とテキストの特徴間の積が同様のスコアとして使用されます。
+
+画像を Transformer エンコーダに供給するために、各画像は固定サイズの重複しないパッチのシーケンスに分割されます。
+これらは線形に埋め込まれます。 [CLS] トークンは、イメージ全体の表現として機能するために追加されます。作家たち
+また、絶対位置埋め込みを追加し、結果として得られるベクトルのシーケンスを標準の Transformer エンコーダに供給します。
+[`CLIPImageProcessor`] を使用して、モデルの画像のサイズ変更 (または再スケール) および正規化を行うことができます。
+
+[`CLIPTokenizer`] はテキストのエンコードに使用されます。 [`CLIPProcessor`] はラップします
+[`CLIPImageProcessor`] と [`CLIPTokenizer`] を両方の単一インスタンスに統合
+テキストをエンコードして画像を準備します。次の例は、次のメソッドを使用して画像とテキストの類似性スコアを取得する方法を示しています。
+[`CLIPProcessor`] と [`CLIPModel`]。
+
+```python
+>>> from PIL import Image
+>>> import requests
+
+>>> from transformers import CLIPProcessor, CLIPModel
+
+>>> model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+>>> processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)
+
+>>> outputs = model(**inputs)
+>>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+>>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
+```
+
+## Resources
+
+CLIP を使い始めるのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。
+
+- [リモート センシング (衛星) 画像とキャプションを使用した CLIP の微調整](https://huggingface.co/blog/fine-tune-clip-rsicd)、[RSICD データセット] を使用して CLIP を微調整する方法に関するブログ投稿(https://github.com/201528014227051/RSICD_optimal) と、データ拡張によるパフォーマンスの変化の比較。
+- この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/contrastive-image-text) は、プレ- [COCO データセット](https://cocodataset.org/#home) を使用してトレーニングされたビジョンおよびテキスト エンコーダー。
+
+<PipelineTag pipeline="image-to-text"/>
+
+- 画像キャプションのビーム検索による推論に事前トレーニング済み CLIP を使用する方法に関する [ノートブック](https://colab.research.google.com/drive/1tuoAC5F4sC7qid56Z0ap-stR3rwdk0ZV?usp=sharing)。 🌎
+
+**画像検索**
+
+- 事前トレーニングされた CLIP を使用した画像検索と MRR (平均相互ランク) スコアの計算に関する [ノートブック](https://colab.research.google.com/drive/1bLVwVKpAndpEDHqjzxVPr_9nGrSbuOQd?usp=sharing)。 🌎
+- 画像の取得と類似性スコアの表示に関する [ノートブック](https://colab.research.google.com/github/deep-diver/image_search_with_natural_language/blob/main/notebooks/Image_Search_CLIP.ipynb)。 🌎
+- 多言語 CLIP を使用して画像とテキストを同じベクトル空間にマッピングする方法に関する [ノートブック](https://colab.research.google.com/drive/1xO-wC_m_GNzgjIBQ4a4znvQkvDoZJvH4?usp=sharing)。 🌎
+- を使用してセマンティック イメージ検索で CLIP を実行する方法に関する [ノートブック](https://colab.research.google.com/github/vivien000/clip-demo/blob/master/clip.ipynb#scrollTo=uzdFhRGqiWkR) [Unsplash](https://unsplash.com) および [TMBD](https://www.themoviedb.org/) データセット。 🌎
+
+**説明可能性**
+
+- 入力トークンと画像セグメントの類似性を視覚化する方法に関する [ノートブック](https://colab.research.google.com/github/hila-chefer/Transformer-MM-Explainability/blob/main/CLIP_explainability.ipynb)。 🌎
+
+ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。
+リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+## CLIPConfig
+
+[[autodoc]] CLIPConfig
+    - from_text_vision_configs
+
+## CLIPTextConfig
+
+[[autodoc]] CLIPTextConfig
+
+## CLIPVisionConfig
+
+[[autodoc]] CLIPVisionConfig
+
+## CLIPTokenizer
+
+[[autodoc]] CLIPTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## CLIPTokenizerFast
+
+[[autodoc]] CLIPTokenizerFast
+
+## CLIPImageProcessor
+
+[[autodoc]] CLIPImageProcessor
+    - preprocess
+
+## CLIPFeatureExtractor
+
+[[autodoc]] CLIPFeatureExtractor
+
+## CLIPProcessor
+
+[[autodoc]] CLIPProcessor
+
+<frameworkcontent>
+<pt>
+
+## CLIPModel
+
+[[autodoc]] CLIPModel
+    - forward
+    - get_text_features
+    - get_image_features
+
+## CLIPTextModel
+
+[[autodoc]] CLIPTextModel
+    - forward
+
+## CLIPTextModelWithProjection
+
+[[autodoc]] CLIPTextModelWithProjection
+    - forward
+
+## CLIPVisionModelWithProjection
+
+[[autodoc]] CLIPVisionModelWithProjection
+    - forward
+
+## CLIPVisionModel
+
+[[autodoc]] CLIPVisionModel
+    - forward
+
+</pt>
+<tf>
+
+## TFCLIPModel
+
+[[autodoc]] TFCLIPModel
+    - call
+    - get_text_features
+    - get_image_features
+
+## TFCLIPTextModel
+
+[[autodoc]] TFCLIPTextModel
+    - call
+
+## TFCLIPVisionModel
+
+[[autodoc]] TFCLIPVisionModel
+    - call
+
+</tf>
+<jax>
+
+## FlaxCLIPModel
+
+[[autodoc]] FlaxCLIPModel
+    - __call__
+    - get_text_features
+    - get_image_features
+
+## FlaxCLIPTextModel
+
+[[autodoc]] FlaxCLIPTextModel
+    - __call__
+
+## FlaxCLIPTextModelWithProjection
+
+[[autodoc]] FlaxCLIPTextModelWithProjection
+    - __call__
+
+## FlaxCLIPVisionModel
+
+[[autodoc]] FlaxCLIPVisionModel
+    - __call__
+
+</jax>
+</frameworkcontent>
diff --git a/docs/source/ja/model_doc/clipseg.md b/docs/source/ja/model_doc/clipseg.md
new file mode 100644
index 00000000000000..c8bdb0a0e4789b
--- /dev/null
+++ b/docs/source/ja/model_doc/clipseg.md
@@ -0,0 +1,104 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CLIPSeg
+
+## Overview
+
+CLIPSeg モデルは、Timo Lüddecke, Alexander Ecker によって [Image Segmentation using Text and Image Prompts](https://arxiv.org/abs/2112.10003) で提案されました。
+そしてアレクサンダー・エッカー。 CLIPSeg は、ゼロショットおよびワンショット画像セグメンテーションのために、凍結された [CLIP](clip) モデルの上に最小限のデコーダを追加します。
+
+論文の要約は次のとおりです。
+
+*画像のセグメンテーションは通常、トレーニングによって解決されます。
+オブジェクト クラスの固定セットのモデル。後で追加のクラスやより複雑なクエリを組み込むとコストがかかります
+これらの式を含むデータセットでモデルを再トレーニングする必要があるためです。ここでシステムを提案します
+任意の情報に基づいて画像セグメンテーションを生成できます。
+テスト時にプロンプ​​トが表示されます。プロンプトはテキストまたは
+画像。このアプローチにより、統一されたモデルを作成できます。
+3 つの一般的なセグメンテーション タスクについて (1 回トレーニング済み)
+参照式のセグメンテーション、ゼロショット セグメンテーション、ワンショット セグメンテーションという明確な課題が伴います。
+CLIP モデルをバックボーンとして構築し、これをトランスベースのデコーダで拡張して、高密度なデータ通信を可能にします。
+予測。の拡張バージョンでトレーニングした後、
+PhraseCut データセット、私たちのシステムは、フリーテキスト プロンプトまたは
+クエリを表す追加の画像。後者の画像ベースのプロンプトのさまざまなバリエーションを詳細に分析します。
+この新しいハイブリッド入力により、動的適応が可能になります。
+前述の 3 つのセグメンテーション タスクのみですが、
+テキストまたは画像をクエリするバイナリ セグメンテーション タスクに
+定式化することができる。最後に、システムがうまく適応していることがわかりました
+アフォーダンスまたはプロパティを含む一般化されたクエリ*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/clipseg_architecture.png"
+alt="描画" width="600"/>
+
+<small> CLIPSeg の概要。 <a href="https://arxiv.org/abs/2112.10003">元の論文から抜粋。</a> </small>
+
+このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。
+元のコードは [ここ](https://github.com/timojl/clipseg) にあります。
+
+## Usage tips
+
+- [`CLIPSegForImageSegmentation`] は、[`CLIPSegModel`] の上にデコーダを追加します。後者は [`CLIPModel`] と同じです。
+- [`CLIPSegForImageSegmentation`] は、テスト時に任意のプロンプトに基づいて画像セグメンテーションを生成できます。プロンプトはテキストのいずれかです
+(`input_ids` としてモデルに提供される) または画像 (`conditional_pixel_values` としてモデルに提供される)。カスタムを提供することもできます
+条件付き埋め込み (`conditional_embeddings`としてモデルに提供されます)。
+
+## Resources
+
+CLIPSeg の使用を開始するのに役立つ、公式 Hugging Face およびコミュニティ (🌎 で示されている) リソースのリスト。ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+<PipelineTag pipeline="image-segmentation"/>
+
+- [CLIPSeg を使用したゼロショット画像セグメンテーション](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/CLIPSeg/Zero_shot_image_segmentation_with_CLIPSeg.ipynb) を説明するノートブック。
+
+## CLIPSegConfig
+
+[[autodoc]] CLIPSegConfig
+    - from_text_vision_configs
+
+## CLIPSegTextConfig
+
+[[autodoc]] CLIPSegTextConfig
+
+## CLIPSegVisionConfig
+
+[[autodoc]] CLIPSegVisionConfig
+
+## CLIPSegProcessor
+
+[[autodoc]] CLIPSegProcessor
+
+## CLIPSegModel
+
+[[autodoc]] CLIPSegModel
+    - forward
+    - get_text_features
+    - get_image_features
+
+## CLIPSegTextModel
+
+[[autodoc]] CLIPSegTextModel
+    - forward
+
+## CLIPSegVisionModel
+
+[[autodoc]] CLIPSegVisionModel
+    - forward
+
+## CLIPSegForImageSegmentation
+
+[[autodoc]] CLIPSegForImageSegmentation
+    - forward
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/clvp.md b/docs/source/ja/model_doc/clvp.md
new file mode 100644
index 00000000000000..0803f5e027ce81
--- /dev/null
+++ b/docs/source/ja/model_doc/clvp.md
@@ -0,0 +1,123 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CLVP
+
+## Overview
+
+CLVP (Contrastive Language-Voice Pretrained Transformer) モデルは、James Betker によって [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) で提案されました。
+
+論文の要約は次のとおりです。
+
+*近年、画像生成の分野は自己回帰変換器と DDPM の応用によって革命を起こしています。これらのアプローチは、画像生成のプロセスを段階的な確率的プロセスとしてモデル化し、大量のコンピューティングとデータを活用して画像の分布を学習します。パフォーマンスを向上させるこの方法論は、画像に限定される必要はありません。この論文では、画像生成ドメインの進歩を音声合成に適用する方法について説明します。その結果、表現力豊かなマルチ音声テキスト読み上げシステムである TorToise が誕生しました。
+
+
+このモデルは [Susnato Dhar](https://huggingface.co/susnato) によって提供されました。
+元のコードは [ここ](https://github.com/neonbjb/tortoise-tts) にあります。
+
+## Usage tips
+
+1. CLVP は Tortoise TTS モデルの不可欠な部分です。
+2. CLVP を使用して、生成されたさまざまな音声候補を提供されたテキストと比較することができ、最良の音声トークンが拡散モデルに転送されます。
+3. Tortoise の使用には、[`ClvpModelForConditionalGeneration.generate()`] メソッドの使用を強くお勧めします。
+4. 16 kHz を期待する他のオーディオ モデルとは対照的に、CLVP モデルはオーディオが 22.05 kHz でサンプリングされることを期待していることに注意してください。
+
+## Brief Explanation:
+
+- [`ClvpTokenizer`] はテキスト入力をトークン化し、[`ClvpFeatureExtractor`] は目的のオーディオからログ メル スペクトログラムを抽出します。
+- [`ClvpConditioningEncoder`] は、これらのテキスト トークンとオーディオ表現を取得し、テキストとオーディオに基づいて条件付けされた埋め込みに変換します。
+- [`ClvpForCausalLM`] は、これらの埋め込みを使用して複数の音声候補を生成します。
+- 各音声候補は音声エンコーダ ([`ClvpEncoder`]) を通過してベクトル表現に変換され、テキスト エンコーダ ([`ClvpEncoder`]) はテキスト トークンを同じ潜在空間に変換します。
+- 最後に、各音声ベクトルをテキスト ベクトルと比較して、どの音声ベクトルがテキスト ベクトルに最も類似しているかを確認します。
+- [`ClvpModelForConditionalGeneration.generate()`] は、上記のすべてのロジックを 1 つのメソッドに圧縮します。
+
+例 ：
+
+```python
+>>> import datasets
+>>> from transformers import ClvpProcessor, ClvpModelForConditionalGeneration
+
+>>> # Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using `datasets` library).
+>>> text = "This is an example text."
+
+>>> ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+>>> ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050))
+>>> sample = ds[0]["audio"]
+
+>>> # Define processor and model.
+>>> processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
+>>> model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")
+
+>>> # Generate processor output and model output.
+>>> processor_output = processor(raw_speech=sample["array"], sampling_rate=sample["sampling_rate"], text=text, return_tensors="pt")
+>>> generated_output = model.generate(**processor_output)
+```
+
+
+## ClvpConfig
+
+[[autodoc]] ClvpConfig
+    - from_sub_model_configs
+
+## ClvpEncoderConfig
+
+[[autodoc]] ClvpEncoderConfig
+
+## ClvpDecoderConfig
+
+[[autodoc]] ClvpDecoderConfig
+
+## ClvpTokenizer
+
+[[autodoc]] ClvpTokenizer
+    - save_vocabulary
+
+## ClvpFeatureExtractor
+
+[[autodoc]] ClvpFeatureExtractor
+    - __call__
+
+## ClvpProcessor
+
+[[autodoc]] ClvpProcessor
+    - __call__
+    - decode
+    - batch_decode
+
+## ClvpModelForConditionalGeneration
+
+[[autodoc]] ClvpModelForConditionalGeneration
+    - forward
+    - generate
+    - get_text_features
+    - get_speech_features
+
+## ClvpForCausalLM
+
+[[autodoc]] ClvpForCausalLM
+
+## ClvpModel
+
+[[autodoc]] ClvpModel
+
+## ClvpEncoder
+
+[[autodoc]] ClvpEncoder
+
+## ClvpDecoder
+
+[[autodoc]] ClvpDecoder
+
diff --git a/docs/source/ja/model_doc/code_llama.md b/docs/source/ja/model_doc/code_llama.md
new file mode 100644
index 00000000000000..4ba345b8d7b9c5
--- /dev/null
+++ b/docs/source/ja/model_doc/code_llama.md
@@ -0,0 +1,125 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CodeLlama
+
+## Overview
+
+Code Llama モデルはによって [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) で提案されました。 Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
+論文の要約は次のとおりです。
+
+*私たちは Code Llama をリリースします。これは Llama 2 に基づくコードの大規模言語モデル ファミリであり、オープン モデルの中で最先端のパフォーマンス、埋め込み機能、大規模な入力コンテキストのサポート、プログラミング タスクのゼロショット命令追従機能を提供します。 。幅広いアプリケーションをカバーするための複数のフレーバーを提供しています。基盤モデル (Code Llama)、Python 特化 (Code Llama - Python)、およびそれぞれ 7B、13B、および 34B パラメーターを備えた命令追従モデル (Code Llama - Instruct) です。すべてのモデルは 16,000 トークンのシーケンスでトレーニングされ、最大 100,000 トークンの入力で改善が見られます。 7B および 13B コード ラマとコード ラマ - 命令バリアントは、周囲のコンテンツに基づいた埋め込みをサポートします。 Code Llama は、いくつかのコード ベンチマークでオープン モデルの中で最先端のパフォーマンスに達し、HumanEval と MBPP でそれぞれ最大 53% と 55% のスコアを獲得しました。特に、Code Llama - Python 7B は HumanEval および MBPP 上で Llama 2 70B よりも優れたパフォーマンスを示し、すべてのモデルは MultiPL-E 上で公開されている他のすべてのモデルよりも優れています。私たちは、研究と商業利用の両方を許可する寛容なライセンスに基づいて Code Llama をリリースしています。*
+
+すべての Code Llama モデル チェックポイントを [こちら](https://huggingface.co/models?search=code_llama) で確認し、[codellama org](https://huggingface.co/codellama) で正式にリリースされたチェックポイントを確認してください。
+
+このモデルは [ArthurZucker](https://huggingface.co/ArthurZ) によって提供されました。著者のオリジナルのコードは [こちら](https://github.com/facebookresearch/llama) にあります。
+
+## Usage tips and examples
+
+<Tip warning={true}>
+
+Code Llama のベースとなる`Llama2`ファミリー モデルは、`bfloat16`を使用してトレーニングされましたが、元の推論では`float16`を使用します。さまざまな精度を見てみましょう。
+
+* `float32`: モデルの初期化に関する PyTorch の規約では、モデルの重みがどの `dtype` で格納されたかに関係なく、モデルを `float32` にロードします。 「transformers」も、PyTorch との一貫性を保つためにこの規則に従っています。これはデフォルトで選択されます。 `AutoModel` API でストレージの重み付けタイプを使用してチェックポイントのロードをキャストする場合は、`torch_dtype="auto"` を指定する必要があります。 `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`。
+* `bfloat16`: コード Llama はこの精度でトレーニングされているため、さらなるトレーニングや微調整に使用することをお勧めします。
+* `float16`: この精度を使用して推論を実行することをお勧めします。通常は `bfloat16` より高速であり、評価メトリクスには `bfloat16` と比べて明らかな低下が見られないためです。 bfloat16 を使用して推論を実行することもできます。微調整後、float16 と bfloat16 の両方で推論結果を確認することをお勧めします。
+
+上で述べたように、モデルを初期化するときに `torch_dtype="auto"` を使用しない限り、ストレージの重みの `dtype` はほとんど無関係です。その理由は、モデルが最初にダウンロードされ (オンラインのチェックポイントの `dtype` を使用)、次に `torch` のデフォルトの `dtype` にキャストされるためです (`torch.float32` になります)。指定された `torch_dtype` がある場合は、代わりにそれが使用されます。
+
+</Tip>
+
+チップ：
+- 充填タスクはすぐにサポートされます。入力を埋めたい場所には `tokenizer.fill_token` を使用する必要があります。
+- モデル変換スクリプトは、`Llama2` ファミリの場合と同じです。
+
+使用例は次のとおりです。
+
+```bash
+python src/transformers/models/llama/convert_llama_weights_to_hf.py \
+    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path
+```
+
+スクリプトを実行するには、(最大のバージョンであっても) float16 精度でモデル全体をホストするのに十分な CPU RAM が必要であることに注意してください。
+いくつかのチェックポイントがあり、それぞれにモデルの各重みの一部が含まれているため、すべてを RAM にロードする必要があります)。
+
+変換後、モデルとトークナイザーは次の方法でロードできます。
+
+```python
+>>> from transformers import LlamaForCausalLM, CodeLlamaTokenizer
+
+>>> tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
+>>> model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf")
+>>> PROMPT = '''def remove_non_ascii(s: str) -> str:
+    """ <FILL_ME>
+    return result
+'''
+>>> input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
+>>> generated_ids = model.generate(input_ids, max_new_tokens=128)
+
+>>> filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0]
+>>> print(PROMPT.replace("<FILL_ME>", filling))
+def remove_non_ascii(s: str) -> str:
+    """ Remove non-ASCII characters from a string.
+
+    Args:
+        s: The string to remove non-ASCII characters from.
+
+    Returns:
+        The string with non-ASCII characters removed.
+    """
+    result = ""
+    for c in s:
+        if ord(c) < 128:
+            result += c
+    return result
+```
+
+塗りつぶされた部分だけが必要な場合:
+
+```python
+>>> from transformers import pipeline
+>>> import torch
+
+>>> generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto")
+>>> generator('def remove_non_ascii(s: str) -> str:\n    """ <FILL_ME>\n    return result', max_new_tokens = 128, return_type = 1)
+```
+
+内部では、トークナイザーが [`<FILL_ME>` によって自動的に分割](https://huggingface.co/docs/transformers/main/model_doc/code_llama#transformers.CodeLlamaTokenizer.fill_token) して、[ に続く書式設定された入力文字列を作成します。オリジナルのトレーニング パターン](https://github.com/facebookresearch/codellama/blob/cb51c14ec761370ba2e2bc351374a79265d0465e/llama/generation.py#L402)。これは、パターンを自分で準備するよりも堅牢です。トークンの接着など、デバッグが非常に難しい落とし穴を回避できます。このモデルまたは他のモデルに必要な CPU および GPU メモリの量を確認するには、その値を決定するのに役立つ [この計算ツール](https://huggingface.co/spaces/hf-accelerate/model-memory-usage) を試してください。
+
+LLaMA トークナイザーは、[sentencepiece](https://github.com/google/sentencepiece) に基づく BPE モデルです。センテンスピースの癖の 1 つは、シーケンスをデコードするときに、最初のトークンが単語の先頭 (例: 「Banana」) である場合、トークナイザーは文字列の先頭にプレフィックス スペースを追加しないことです。
+
+<Tip>
+
+コード Llama は、`Llama2` モデルと同じアーキテクチャを持っています。API リファレンスについては、[Llama2 のドキュメント ページ](llama2) を参照してください。
+以下の Code Llama トークナイザーのリファレンスを見つけてください。
+</Tip>
+
+## CodeLlamaTokenizer
+
+[[autodoc]] CodeLlamaTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## CodeLlamaTokenizerFast
+
+[[autodoc]] CodeLlamaTokenizerFast
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - update_post_processor
+    - save_vocabulary
diff --git a/docs/source/ja/model_doc/codegen.md b/docs/source/ja/model_doc/codegen.md
new file mode 100644
index 00000000000000..78caefe043319b
--- /dev/null
+++ b/docs/source/ja/model_doc/codegen.md
@@ -0,0 +1,90 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CodeGen
+
+## Overview
+
+
+CodeGen モデルは、[A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) で Erik Nijkamp、Bo Pang、林宏明、Lifu Tu、Huan Wang、Yingbo Zhou、Silvio Savarese、Caiming Xiong およびカイミン・ションさん。
+
+CodeGen は、[The Pile](https://pile.eleuther.ai/)、BigQuery、BigPython で順次トレーニングされたプログラム合成用の自己回帰言語モデルです。
+
+論文の要約は次のとおりです。
+
+*プログラム合成は、与えられた問題仕様の解決策としてコンピューター プログラムを生成することを目的としています。我々は、大規模な言語モデルを介した会話型プログラム合成アプローチを提案します。これは、従来のアプローチで直面した広大なプログラム空間とユーザーの意図の仕様を検索するという課題に対処します。私たちの新しいアプローチでは、仕様とプログラムを作成するプロセスを、ユーザーとシステムの間の複数回の対話として捉えます。これはプログラム合成をシーケンス予測問題として扱い、仕様が自然言語で表現され、目的のプログラムが条件付きでサンプリングされます。私たちは、自然言語とプログラミング言語のデータに基づいて、CodeGen と呼ばれる大規模な言語モデルのファミリーをトレーニングします。データの監視が弱く、データ サイズとモデル サイズが拡大すると、単純な自己回帰言語モデリングから会話能力が生まれます。会話型プログラム合成におけるモデルの動作を研究するために、マルチターン プログラミング ベンチマーク (MTPB) を開発します。このベンチマークでは、各問題を解決するには、ユーザーとモデル間のマルチターン会話を介したマルチステップ合成が必要です。私たちの調査結果は、会話機能の出現と、提案されている会話プログラム合成パラダイムの有効性を示しています。さらに、私たちのモデル CodeGen (TPU-v4 でトレーニングされた最大 16B パラメーターを含む) は、HumanEval ベンチマークで OpenAI の Codex を上回ります。私たちはチェックポイントを含むトレーニング ライブラリ JaxFormer をオープン ソースのコントリビューションとして利用できるようにしています: [この https URL](https://github.com/salesforce/codegen)*。
+
+このモデルは [林 宏明](https://huggingface.co/rooa) によって寄稿されました。
+元のコードは [ここ](https://github.com/salesforce/codegen) にあります。
+
+## Checkpoint Naming
+
+* CodeGen モデル [チェックポイント](https://huggingface.co/models?other=codegen) は、可変サイズのさまざまな事前トレーニング データで利用できます。
+* 形式は「Salesforce/codegen-{size}-{data}」です。ここで、
+  * `size`: `350M`、`2B`、`6B`、`16B`
+  * `data`:
+    * `nl`: パイルで事前トレーニング済み
+    * `multi`: `nl` で初期化され、複数のプログラミング言語データでさらに事前トレーニングされます。
+    * `mono`: `multi` で初期化され、Python データでさらに事前トレーニングされます。
+* たとえば、`Salesforce/codegen-350M-mono` は、Pile、複数のプログラミング言語、および Python で順次事前トレーニングされた 3 億 5,000 万のパラメーターのチェックポイントを提供します。
+
+## Usage example
+
+```python
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+>>> checkpoint = "Salesforce/codegen-350M-mono"
+>>> model = AutoModelForCausalLM.from_pretrained(checkpoint)
+>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+
+>>> text = "def hello_world():"
+
+>>> completion = model.generate(**tokenizer(text, return_tensors="pt"))
+
+>>> print(tokenizer.decode(completion[0]))
+def hello_world():
+    print("Hello World")
+
+hello_world()
+```
+
+## Resources
+
+- [因果言語モデリング タスク ガイド](../tasks/language_modeling)
+
+## CodeGenConfig
+
+[[autodoc]] CodeGenConfig
+    - all
+
+## CodeGenTokenizer
+
+[[autodoc]] CodeGenTokenizer
+    - save_vocabulary
+
+## CodeGenTokenizerFast
+
+[[autodoc]] CodeGenTokenizerFast
+
+## CodeGenModel
+
+[[autodoc]] CodeGenModel
+    - forward
+
+## CodeGenForCausalLM
+
+[[autodoc]] CodeGenForCausalLM
+    - forward
diff --git a/docs/source/ja/model_doc/conditional_detr.md b/docs/source/ja/model_doc/conditional_detr.md
new file mode 100644
index 00000000000000..4ef09f0b6d898e
--- /dev/null
+++ b/docs/source/ja/model_doc/conditional_detr.md
@@ -0,0 +1,75 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Conditional DETR
+
+## Overview
+
+条件付き DETR モデルは、[Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) で Depu Meng、Xiaokang Chen、Zejia Fan、Gang Zeng、Houqiang Li、Yuhui Yuan、Lei Sun,  Jingdong Wang によって提案されました。王京東。条件付き DETR は、高速 DETR トレーニングのための条件付きクロスアテンション メカニズムを提供します。条件付き DETR は DETR よりも 6.7 倍から 10 倍速く収束します。
+
+論文の要約は次のとおりです。
+
+*最近開発された DETR アプローチは、トランスフォーマー エンコーダーおよびデコーダー アーキテクチャを物体検出に適用し、有望なパフォーマンスを実現します。この論文では、トレーニングの収束が遅いという重要な問題を扱い、高速 DETR トレーニングのための条件付きクロスアテンション メカニズムを紹介します。私たちのアプローチは、DETR におけるクロスアテンションが 4 つの四肢の位置特定とボックスの予測にコンテンツの埋め込みに大きく依存しているため、高品質のコンテンツの埋め込みの必要性が高まり、トレーニングの難易度が高くなるという点に動機づけられています。条件付き DETR と呼ばれる私たちのアプローチは、デコーダーのマルチヘッド クロスアテンションのためにデコーダーの埋め込みから条件付きの空間クエリを学習します。利点は、条件付き空間クエリを通じて、各クロスアテンション ヘッドが、個別の領域 (たとえば、1 つのオブジェクトの端またはオブジェクト ボックス内の領域) を含むバンドに注目できることです。これにより、オブジェクト分類とボックス回帰のための個別の領域をローカライズするための空間範囲が狭まり、コンテンツの埋め込みへの依存が緩和され、トレーニングが容易になります。実験結果は、条件付き DETR がバックボーン R50 および R101 で 6.7 倍速く収束し、より強力なバックボーン DC5-R50 および DC5-R101 で 10 倍速く収束することを示しています。コードは https://github.com/Atten4Vis/ConditionalDETR で入手できます。*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/conditional_detr_curve.jpg"
+alt="描画" width="600"/>
+
+<small> 条件付き DETR は、元の DETR に比べてはるかに速い収束を示します。 <a href="https://arxiv.org/abs/2108.06152">元の論文</a>から引用。</small>
+
+このモデルは [DepuMeng](https://huggingface.co/DepuMeng) によって寄稿されました。元のコードは [ここ](https://github.com/Atten4Vis/ConditionalDETR) にあります。
+
+## Resources
+
+- [オブジェクト検出タスクガイド](../tasks/object_detection)
+
+## ConditionalDetrConfig
+
+[[autodoc]] ConditionalDetrConfig
+
+## ConditionalDetrImageProcessor
+
+[[autodoc]] ConditionalDetrImageProcessor
+    - preprocess
+    - post_process_object_detection
+    - post_process_instance_segmentation
+    - post_process_semantic_segmentation
+    - post_process_panoptic_segmentation
+
+## ConditionalDetrFeatureExtractor
+
+[[autodoc]] ConditionalDetrFeatureExtractor
+    - __call__
+    - post_process_object_detection
+    - post_process_instance_segmentation
+    - post_process_semantic_segmentation
+    - post_process_panoptic_segmentation
+
+## ConditionalDetrModel
+
+[[autodoc]] ConditionalDetrModel
+    - forward
+
+## ConditionalDetrForObjectDetection
+
+[[autodoc]] ConditionalDetrForObjectDetection
+    - forward
+
+## ConditionalDetrForSegmentation
+
+[[autodoc]] ConditionalDetrForSegmentation
+    - forward
+
+    
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/convbert.md b/docs/source/ja/model_doc/convbert.md
new file mode 100644
index 00000000000000..5d15f86c5136b7
--- /dev/null
+++ b/docs/source/ja/model_doc/convbert.md
@@ -0,0 +1,145 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# ConvBERT
+
+<div class="flex flex-wrap space-x-1">
+<a href="https://huggingface.co/models?filter=convbert">
+<img alt="Models" src="https://img.shields.io/badge/All_model_pages-convbert-blueviolet">
+</a>
+<a href="https://huggingface.co/spaces/docs-demos/conv-bert-base">
+<img alt="Spaces" src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue">
+</a>
+</div>
+
+## Overview
+
+ConvBERT モデルは、[ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) で Zihang Jiang、Weihao Yu、Daquan Zhou、Yunpeng Chen、Jiashi Feng、Shuicheng Yan によって提案されました。
+やん。
+
+論文の要約は次のとおりです。
+
+*BERT やそのバリアントなどの事前トレーニング済み言語モデルは、最近、さまざまな環境で目覚ましいパフォーマンスを達成しています。
+自然言語理解タスク。ただし、BERT はグローバルな自己注意ブロックに大きく依存しているため、問題が発生します。
+メモリ使用量と計算コストが大きくなります。すべての注意が入力シーケンス全体に対してクエリを実行しますが、
+グローバルな観点からアテンション マップを生成すると、一部のヘッドはローカルな依存関係のみを学習する必要があることがわかります。
+これは、計算の冗長性が存在することを意味します。したがって、我々は、新しいスパンベースの動的畳み込みを提案します。
+これらのセルフアテンション ヘッドを置き換えて、ローカルの依存関係を直接モデル化します。新しいコンボリューションヘッドと、
+自己注意の頭を休め、グローバルとローカルの両方の状況でより効率的な新しい混合注意ブロックを形成します
+学ぶ。この混合注意設計を BERT に装備し、ConvBERT モデルを構築します。実験でわかったことは、
+ConvBERT は、トレーニング コストが低く、さまざまな下流タスクにおいて BERT およびその亜種よりも大幅に優れたパフォーマンスを発揮します。
+モデルパラメータが少なくなります。注目すべきことに、ConvBERTbase モデルは 86.4 GLUE スコアを達成し、ELECTRAbase よりも 0.7 高いのに対し、
+トレーニングコストは 1/4 未満です。コードと事前トレーニングされたモデルがリリースされます。*
+
+このモデルは、[abhishek](https://huggingface.co/abhishek) によって提供されました。オリジナルの実装が見つかります
+ここ: https://github.com/yitu-opensource/ConvBert
+
+## Usage tips
+
+ConvBERT トレーニングのヒントは BERT のヒントと似ています。使用上のヒントについては、[BERT ドキュメント](bert) を参照してください。
+
+## Resources
+
+- [テキスト分類タスクガイド](../tasks/sequence_classification)
+- [トークン分類タスクガイド](../tasks/token_classification)
+- [質問回答タスク ガイド](../tasks/question_answering)
+- [マスクされた言語モデリング タスク ガイド](../tasks/masked_lang_modeling)
+- [多肢選択タスク ガイド](../tasks/multiple_choice)
+
+## ConvBertConfig
+
+[[autodoc]] ConvBertConfig
+
+## ConvBertTokenizer
+
+[[autodoc]] ConvBertTokenizer
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+## ConvBertTokenizerFast
+
+[[autodoc]] ConvBertTokenizerFast
+
+<frameworkcontent>
+<pt>
+
+## ConvBertModel
+
+[[autodoc]] ConvBertModel
+    - forward
+
+## ConvBertForMaskedLM
+
+[[autodoc]] ConvBertForMaskedLM
+    - forward
+
+## ConvBertForSequenceClassification
+
+[[autodoc]] ConvBertForSequenceClassification
+    - forward
+
+## ConvBertForMultipleChoice
+
+[[autodoc]] ConvBertForMultipleChoice
+    - forward
+
+## ConvBertForTokenClassification
+
+[[autodoc]] ConvBertForTokenClassification
+    - forward
+
+## ConvBertForQuestionAnswering
+
+[[autodoc]] ConvBertForQuestionAnswering
+    - forward
+
+</pt>
+<tf>
+
+## TFConvBertModel
+
+[[autodoc]] TFConvBertModel
+    - call
+
+## TFConvBertForMaskedLM
+
+[[autodoc]] TFConvBertForMaskedLM
+    - call
+
+## TFConvBertForSequenceClassification
+
+[[autodoc]] TFConvBertForSequenceClassification
+    - call
+
+## TFConvBertForMultipleChoice
+
+[[autodoc]] TFConvBertForMultipleChoice
+    - call
+
+## TFConvBertForTokenClassification
+
+[[autodoc]] TFConvBertForTokenClassification
+    - call
+
+## TFConvBertForQuestionAnswering
+
+[[autodoc]] TFConvBertForQuestionAnswering
+    - call
+
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/model_doc/convnext.md b/docs/source/ja/model_doc/convnext.md
new file mode 100644
index 00000000000000..4386a7df8ceadb
--- /dev/null
+++ b/docs/source/ja/model_doc/convnext.md
@@ -0,0 +1,94 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# ConvNeXT
+
+## Overview
+
+ConvNeXT モデルは、[A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) で Zhuang Liu、Hanzi Mao、Chao-Yuan Wu、Christoph Feichtenhofer、Trevor Darrell、Saining Xie によって提案されました。
+ConvNeXT は、ビジョン トランスフォーマーの設計からインスピレーションを得た純粋な畳み込みモデル (ConvNet) であり、ビジョン トランスフォーマーよりも優れたパフォーマンスを発揮すると主張しています。
+
+論文の要約は次のとおりです。
+
+*視覚認識の「狂騒の 20 年代」は、最先端の画像分類モデルとして ConvNet にすぐに取って代わられた Vision Transformers (ViT) の導入から始まりました。
+一方、バニラ ViT は、オブジェクト検出やセマンティック セグメンテーションなどの一般的なコンピューター ビジョン タスクに適用すると困難に直面します。階層型トランスフォーマーです
+(Swin Transformers など) は、いくつかの ConvNet の以前の機能を再導入し、Transformers を汎用ビジョン バックボーンとして実用的に可能にし、幅広い環境で顕著なパフォーマンスを実証しました。
+さまざまな視覚タスク。ただし、このようなハイブリッド アプローチの有効性は、依然として、固有の誘導性ではなく、トランスフォーマーの本質的な優位性によるところが大きいと考えられています。
+畳み込みのバイアス。この作業では、設計空間を再検討し、純粋な ConvNet が達成できる限界をテストします。標準 ResNet を設計に向けて徐々に「最新化」します。
+ビジョン Transformer の概要を確認し、途中でパフォーマンスの違いに寄与するいくつかの重要なコンポーネントを発見します。この調査の結果は、純粋な ConvNet モデルのファミリーです。
+ConvNextと呼ばれます。 ConvNeXts は完全に標準の ConvNet モジュールから構築されており、精度と拡張性の点で Transformers と有利に競合し、87.8% の ImageNet トップ 1 精度を達成しています。
+標準 ConvNet のシンプルさと効率を維持しながら、COCO 検出と ADE20K セグメンテーションでは Swin Transformers よりも優れたパフォーマンスを発揮します。*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/convnext_architecture.jpg"
+alt="描画" width="600"/>
+
+<small> ConvNeXT アーキテクチャ。 <a href="https://arxiv.org/abs/2201.03545">元の論文</a>から抜粋。</small>
+
+このモデルは、[nielsr](https://huggingface.co/nielsr) によって提供されました。 TensorFlow バージョンのモデルは [ariG23498](https://github.com/ariG23498) によって提供されました。
+[gante](https://github.com/gante)、および [sayakpaul](https://github.com/sayakpaul) (同等の貢献)。元のコードは [こちら](https://github.com/facebookresearch/ConvNeXt) にあります。
+
+## Resources
+
+ConvNeXT の使用を開始するのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。
+
+<PipelineTag pipeline="image-classification"/>
+
+- [`ConvNextForImageClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)。
+- 参照: [画像分類タスク ガイド](../tasks/image_classification)
+
+ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+## ConvNextConfig
+
+[[autodoc]] ConvNextConfig
+
+## ConvNextFeatureExtractor
+
+[[autodoc]] ConvNextFeatureExtractor
+
+## ConvNextImageProcessor
+
+[[autodoc]] ConvNextImageProcessor
+    - preprocess
+
+<frameworkcontent>
+<pt>
+
+## ConvNextModel
+
+[[autodoc]] ConvNextModel
+    - forward
+
+## ConvNextForImageClassification
+
+[[autodoc]] ConvNextForImageClassification
+    - forward
+
+</pt>
+<tf>
+
+## TFConvNextModel
+
+[[autodoc]] TFConvNextModel
+    - call
+
+## TFConvNextForImageClassification
+
+[[autodoc]] TFConvNextForImageClassification
+    - call
+
+</tf>
+</frameworkcontent>
\ No newline at end of file
diff --git a/docs/source/ja/model_doc/convnextv2.md b/docs/source/ja/model_doc/convnextv2.md
new file mode 100644
index 00000000000000..9e4d54df24b1c1
--- /dev/null
+++ b/docs/source/ja/model_doc/convnextv2.md
@@ -0,0 +1,68 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# ConvNeXt V2
+
+## Overview
+
+ConvNeXt V2 モデルは、Sanghyun Woo、Shobhik Debnath、Ronghang Hu、Xinlei Chen、Zhuang Liu, In So Kweon, Saining Xie. によって [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) で提案されました。
+ConvNeXt V2 は、Vision Transformers の設計からインスピレーションを得た純粋な畳み込みモデル (ConvNet) であり、[ConvNeXT](convnext) の後継です。
+
+論文の要約は次のとおりです。
+
+*アーキテクチャの改善と表現学習フレームワークの改善により、視覚認識の分野は 2020 年代初頭に急速な近代化とパフォーマンスの向上を実現しました。たとえば、ConvNeXt に代表される最新の ConvNet は、さまざまなシナリオで強力なパフォーマンスを実証しています。これらのモデルはもともと ImageNet ラベルを使用した教師あり学習用に設計されましたが、マスク オートエンコーダー (MAE) などの自己教師あり学習手法からも潜在的に恩恵を受けることができます。ただし、これら 2 つのアプローチを単純に組み合わせると、パフォーマンスが標準以下になることがわかりました。この論文では、完全畳み込みマスク オートエンコーダ フレームワークと、チャネル間の機能競合を強化するために ConvNeXt アーキテクチャに追加できる新しい Global Response Normalization (GRN) 層を提案します。この自己教師あり学習手法とアーキテクチャの改善の共同設計により、ConvNeXt V2 と呼ばれる新しいモデル ファミリが誕生しました。これにより、ImageNet 分類、COCO 検出、ADE20K セグメンテーションなどのさまざまな認識ベンチマークにおける純粋な ConvNet のパフォーマンスが大幅に向上します。また、ImageNet でトップ 1 の精度 76.7% を誇る効率的な 370 万パラメータの Atto モデルから、最先端の 88.9% を達成する 650M Huge モデルまで、さまざまなサイズの事前トレーニング済み ConvNeXt V2 モデルも提供しています。公開トレーニング データのみを使用した精度*。
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/convnextv2_architecture.png"
+alt="描画" width="600"/>
+
+<small> ConvNeXt V2 アーキテクチャ。 <a href="https://arxiv.org/abs/2301.00808">元の論文</a>から抜粋。</small>
+
+このモデルは [adirik](https://huggingface.co/adirik) によって提供されました。元のコードは [こちら](https://github.com/facebookresearch/ConvNeXt-V2) にあります。
+
+## Resources
+
+ConvNeXt V2 の使用を開始するのに役立つ公式 Hugging Face およびコミュニティ (🌎 で示される) リソースのリスト。
+
+<PipelineTag pipeline="image-classification"/>
+
+- [`ConvNextV2ForImageClassification`] は、この [サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) および [ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)。
+
+ここに含めるリソースの送信に興味がある場合は、お気軽にプル リクエストを開いてください。審査させていただきます。リソースは、既存のリソースを複製するのではなく、何か新しいものを示すことが理想的です。
+
+## ConvNextV2Config
+
+[[autodoc]] ConvNextV2Config
+
+## ConvNextV2Model
+
+[[autodoc]] ConvNextV2Model
+    - forward
+
+## ConvNextV2ForImageClassification
+
+[[autodoc]] ConvNextV2ForImageClassification
+    - forward
+
+## TFConvNextV2Model
+
+[[autodoc]] TFConvNextV2Model
+    - call
+
+
+## TFConvNextV2ForImageClassification
+
+[[autodoc]] TFConvNextV2ForImageClassification
+    - call
diff --git a/docs/source/ja/model_doc/cpm.md b/docs/source/ja/model_doc/cpm.md
new file mode 100644
index 00000000000000..9776f676844e7d
--- /dev/null
+++ b/docs/source/ja/model_doc/cpm.md
@@ -0,0 +1,54 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# CPM
+
+## Overview
+
+CPM モデルは、Zhengyan Zhang、Xu Han、Hao Zhou、Pei Ke、Yuxian Gu によって [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) で提案されました。葉徳明、秦裕佳、
+Yusheng Su、Haozhe Ji、Jian Guan、Fanchao Qi、Xiaozi Wang、Yanan Zheng、Guoyang Zeng、Huanqi Cao、Shengqi Chen、
+Daixuan Li、Zhenbo Sun、Zhiyuan Liu、Minlie Huang、Wentao Han、Jie Tang、Juanzi Li、Xiaoyan Zhu、Maosong Sun。
+
+論文の要約は次のとおりです。
+
+*事前トレーニングされた言語モデル (PLM) は、さまざまな下流の NLP タスクに有益であることが証明されています。最近ではGPT-3、
+1,750億個のパラメータと570GBの学習データを備え、数回の撮影（1枚でも）の容量で大きな注目を集めました
+ゼロショット）学習。ただし、GPT-3 を適用して中国語の NLP タスクに対処することは依然として困難です。
+GPT-3 の言語は主に英語であり、パラメーターは公開されていません。この技術レポートでは、
+大規模な中国語トレーニング データに対する生成的事前トレーニングを備えた中国語事前トレーニング済み言語モデル (CPM)。最高に
+私たちの知識の限りでは、26 億のパラメータと 100GB の中国語トレーニング データを備えた CPM は、事前トレーニングされた中国語としては最大のものです。
+言語モデルは、会話、エッセイの作成、
+クローゼテストと言語理解。広範な実験により、CPM が多くの環境で優れたパフォーマンスを達成できることが実証されています。
+少数ショット (ゼロショットでも) 学習の設定での NLP タスク。*
+
+このモデルは [canwenxu](https://huggingface.co/canwenxu) によって提供されました。オリジナルの実装が見つかります
+ここ: https://github.com/TsinghuaAI/CPM-Generate
+
+
+<Tip>
+
+CPM のアーキテクチャは、トークン化方法を除いて GPT-2 と同じです。詳細については、[GPT-2 ドキュメント](gpt2) を参照してください。
+API リファレンス情報。
+
+</Tip>
+
+## CpmTokenizer
+
+[[autodoc]] CpmTokenizer
+
+## CpmTokenizerFast
+
+[[autodoc]] CpmTokenizerFast
diff --git a/docs/source/ja/perf_hardware.md b/docs/source/ja/perf_hardware.md
index b58db6e76d0995..a0db527a94b662 100644
--- a/docs/source/ja/perf_hardware.md
+++ b/docs/source/ja/perf_hardware.md
@@ -139,7 +139,7 @@ NVLinkを使用すると、トレーニングが約23％速く完了すること
 ```bash
 # DDP w/ NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
@@ -148,7 +148,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch
 
 # DDP w/o NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
diff --git a/docs/source/ja/perf_infer_gpu_one.md b/docs/source/ja/perf_infer_gpu_one.md
index d6a18a6f3e2047..6d7466e022220a 100644
--- a/docs/source/ja/perf_infer_gpu_one.md
+++ b/docs/source/ja/perf_infer_gpu_one.md
@@ -44,7 +44,7 @@ Flash Attention 2は、モデルのdtypeが`fp16`または`bf16`の場合にの
 
 ### Quick usage
 
-モデルでFlash Attention 2を有効にするには、`from_pretrained`の引数に`use_flash_attention_2`を追加します。
+モデルでFlash Attention 2を有効にするには、`from_pretrained`の引数に`attn_implementation="flash_attention_2"`を追加します。
 
 
 ```python
@@ -57,7 +57,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     torch_dtype=torch.bfloat16, 
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 ```
 
@@ -114,7 +114,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     load_in_8bit=True,
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 ```
 
@@ -132,7 +132,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     load_in_4bit=True,
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 ```
 
@@ -151,7 +151,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_id)
 model = AutoModelForCausalLM.from_pretrained(
     model_id, 
     load_in_4bit=True,
-    use_flash_attention_2=True,
+    attn_implementation="flash_attention_2",
 )
 
 lora_config = LoraConfig(
diff --git a/docs/source/ja/perf_train_gpu_many.md b/docs/source/ja/perf_train_gpu_many.md
index fd7713c49369f6..71d6c2805865aa 100644
--- a/docs/source/ja/perf_train_gpu_many.md
+++ b/docs/source/ja/perf_train_gpu_many.md
@@ -143,7 +143,7 @@ python examples/pytorch/language-modeling/run_clm.py \
 
 # DDP w/ NVlink
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
-python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -151,7 +151,7 @@ python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-
 
 # DDP w/o NVlink
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
-python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
diff --git a/docs/source/ja/preprocessing.md b/docs/source/ja/preprocessing.md
index c4e3566fd3aee4..b8fad2a0d21b36 100644
--- a/docs/source/ja/preprocessing.md
+++ b/docs/source/ja/preprocessing.md
@@ -227,7 +227,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 オーディオタスクの場合、データセットをモデル用に準備するために[特徴抽出器](main_classes/feature_extractor)が必要です。
 特徴抽出器は生のオーディオデータから特徴を抽出し、それらをテンソルに変換するために設計されています。
 
-[PolyAI/minds14](https://huggingface.co/datasets/PolyAI/minds14)データセットをロードして（データセットのロード方法の詳細については🤗 [Datasetsチュートリアル](https://huggingface.co/docs/datasets/load_hub.html)を参照）、
+[PolyAI/minds14](https://huggingface.co/datasets/PolyAI/minds14)データセットをロードして（データセットのロード方法の詳細については🤗 [Datasetsチュートリアル](https://huggingface.co/docs/datasets/load_hub)を参照）、
 オーディオデータセットで特徴抽出器をどのように使用できるかを確認してみましょう：
 
 ```python
@@ -349,7 +349,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 </Tip>
 
-コンピュータビジョンのデータセットで画像プロセッサを使用する方法を示すために、[food101](https://huggingface.co/datasets/food101)データセットをロードします（データセットのロード方法の詳細については🤗[Datasetsチュートリアル](https://huggingface.co/docs/datasets/load_hub.html)を参照）：
+コンピュータビジョンのデータセットで画像プロセッサを使用する方法を示すために、[food101](https://huggingface.co/datasets/food101)データセットをロードします（データセットのロード方法の詳細については🤗[Datasetsチュートリアル](https://huggingface.co/docs/datasets/load_hub)を参照）：
 
 <Tip>
 
@@ -363,7 +363,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 >>> dataset = load_dataset("food101", split="train[:100]")
 ```
 
-次に、🤗 Datasetsの [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image) 機能で画像を見てみましょう：
+次に、🤗 Datasetsの [`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image) 機能で画像を見てみましょう：
 
 ```python
 >>> dataset[0]["image"]
@@ -419,7 +419,7 @@ AutoImageProcessorを[`AutoImageProcessor.from_pretrained`]を使用してロー
 画像を増強変換の一部として正規化したい場合は、`image_processor.image_mean` と `image_processor.image_std` の値を使用してください。
 </Tip>
 
-3. 次に、🤗 Datasetsの[`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)を使用して、変換をリアルタイムで適用します：
+3. 次に、🤗 Datasetsの[`set_transform`](https://huggingface.co/docs/datasets/process#format-transform)を使用して、変換をリアルタイムで適用します：
 
 ```python
 >>> dataset.set_transform(transforms)
@@ -474,7 +474,7 @@ AutoImageProcessorを[`AutoImageProcessor.from_pretrained`]を使用してロー
 
 マルチモーダル入力を使用するタスクの場合、モデル用にデータセットを準備するための[プロセッサ](main_classes/processors)が必要です。プロセッサは、トークナイザや特徴量抽出器などの2つの処理オブジェクトを結合します。
 
-自動音声認識（ASR）のためのプロセッサの使用方法を示すために、[LJ Speech](https://huggingface.co/datasets/lj_speech)データセットをロードします（データセットのロード方法の詳細については🤗 [Datasets チュートリアル](https://huggingface.co/docs/datasets/load_hub.html)を参照）：
+自動音声認識（ASR）のためのプロセッサの使用方法を示すために、[LJ Speech](https://huggingface.co/datasets/lj_speech)データセットをロードします（データセットのロード方法の詳細については🤗 [Datasets チュートリアル](https://huggingface.co/docs/datasets/load_hub)を参照）：
 
 ```python
 >>> from datasets import load_dataset
diff --git a/docs/source/ja/run_scripts.md b/docs/source/ja/run_scripts.md
index 1fde9afc0c6e5d..a7cc89d1348491 100644
--- a/docs/source/ja/run_scripts.md
+++ b/docs/source/ja/run_scripts.md
@@ -140,7 +140,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 以下は提供されたBashコードです。このコードの日本語訳をMarkdown形式で記載します。
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/ja/tasks/asr.md b/docs/source/ja/tasks/asr.md
new file mode 100644
index 00000000000000..fd564abdc5c908
--- /dev/null
+++ b/docs/source/ja/tasks/asr.md
@@ -0,0 +1,380 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Automatic speech recognition
+
+[[open-in-colab]]
+
+<Youtube id="TksaY_FDgnk"/>
+
+自動音声認識 (ASR) は音声信号をテキストに変換し、一連の音声入力をテキスト出力にマッピングします。 Siri や Alexa などの仮想アシスタントは ASR モデルを使用してユーザーを日常的に支援しており、ライブキャプションや会議中のメモ取りなど、他にも便利なユーザー向けアプリケーションが数多くあります。
+
+このガイドでは、次の方法を説明します。
+
+1. [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) データセットの [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) を微調整して、音声をテキストに書き起こします。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [M-CTC-T](../model_doc/mctct), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate jiwer
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load MInDS-14 dataset
+
+まず、🤗 データセット ライブラリから [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) データセットの小さいサブセットをロードします。これにより、完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train[:100]")
+```
+
+[`~Dataset.train_test_split`] メソッドを使用して、データセットの `train` 分割をトレイン セットとテスト セットに分割します。
+
+```py
+>>> minds = minds.train_test_split(test_size=0.2)
+```
+
+次に、データセットを見てみましょう。
+
+```py
+>>> minds
+DatasetDict({
+    train: Dataset({
+        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
+        num_rows: 16
+    })
+    test: Dataset({
+        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
+        num_rows: 4
+    })
+})
+```
+
+データセットには`lang_id`や`english_transcription`などの多くの有用な情報が含まれていますが、このガイドでは「`audio`」と「`transciption`」に焦点を当てます。 [`~datasets.Dataset.remove_columns`] メソッドを使用して他の列を削除します。
+
+```py
+>>> minds = minds.remove_columns(["english_transcription", "intent_class", "lang_id"])
+```
+
+もう一度例を見てみましょう。
+
+```py
+>>> minds["train"][0]
+{'audio': {'array': array([-0.00024414,  0.        ,  0.        , ...,  0.00024414,
+          0.00024414,  0.00024414], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+  'sampling_rate': 8000},
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+ 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"}
+```
+
+次の 2 つのフィールドがあります。
+
+- `audio`: 音声ファイルをロードしてリサンプリングするために呼び出す必要がある音声信号の 1 次元の `array`。
+- `transcription`: ターゲットテキスト。
+
+## Preprocess
+
+次のステップでは、Wav2Vec2 プロセッサをロードしてオーディオ信号を処理します。
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base")
+```
+
+MInDS-14 データセットのサンプリング レートは 8000kHz です (この情報は [データセット カード](https://huggingface.co/datasets/PolyAI/minds14) で確認できます)。つまり、データセットを再サンプリングする必要があります。事前トレーニングされた Wav2Vec2 モデルを使用するには、16000kHz に設定します。
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([-2.38064706e-04, -1.58618059e-04, -5.43987835e-06, ...,
+          2.78103951e-04,  2.38446111e-04,  1.18740834e-04], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+  'sampling_rate': 16000},
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602ba9e2963e11ccd901cd4f.wav',
+ 'transcription': "hi I'm trying to use the banking app on my phone and currently my checking and savings account balance is not refreshing"}
+```
+
+上の `transcription` でわかるように、テキストには大文字と小文字が混在しています。 Wav2Vec2 トークナイザーは大文字のみでトレーニングされるため、テキストがトークナイザーの語彙と一致することを確認する必要があります。
+
+```py
+>>> def uppercase(example):
+...     return {"transcription": example["transcription"].upper()}
+
+
+>>> minds = minds.map(uppercase)
+```
+
+次に、次の前処理関数を作成します。
+
+1. `audio`列を呼び出して、オーディオ ファイルをロードしてリサンプリングします。
+2. オーディオ ファイルから `input_values` を抽出し、プロセッサを使用して `transcription` 列をトークン化します。
+
+```py
+>>> def prepare_dataset(batch):
+...     audio = batch["audio"]
+...     batch = processor(audio["array"], sampling_rate=audio["sampling_rate"], text=batch["transcription"])
+...     batch["input_length"] = len(batch["input_values"][0])
+...     return batch
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] 関数を使用します。 `num_proc` パラメータを使用してプロセスの数を増やすことで、`map` を高速化できます。 [`~datasets.Dataset.remove_columns`] メソッドを使用して、不要な列を削除します。 
+
+```py
+>>> encoded_minds = minds.map(prepare_dataset, remove_columns=minds.column_names["train"], num_proc=4)
+```
+
+🤗 Transformers には ASR 用のデータ照合器がないため、[`DataCollat​​orWithPadding`] を調整してサンプルのバッチを作成する必要があります。また、テキストとラベルが (データセット全体ではなく) バッチ内の最も長い要素の長さに合わせて動的に埋め込まれ、均一な長さになります。 `padding=True` を設定すると、`tokenizer` 関数でテキストを埋め込むことができますが、動的な埋め込みの方が効率的です。
+
+他のデータ照合器とは異なり、この特定のデータ照合器は、`input_values`と `labels`」に異なるパディング方法を適用する必要があります。
+
+```py
+>>> import torch
+
+>>> from dataclasses import dataclass, field
+>>> from typing import Any, Dict, List, Optional, Union
+
+
+>>> @dataclass
+... class DataCollatorCTCWithPadding:
+...     processor: AutoProcessor
+...     padding: Union[bool, str] = "longest"
+
+...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...         # split inputs and labels since they have to be of different lengths and need
+...         # different padding methods
+...         input_features = [{"input_values": feature["input_values"][0]} for feature in features]
+...         label_features = [{"input_ids": feature["labels"]} for feature in features]
+
+...         batch = self.processor.pad(input_features, padding=self.padding, return_tensors="pt")
+
+...         labels_batch = self.processor.pad(labels=label_features, padding=self.padding, return_tensors="pt")
+
+...         # replace padding with -100 to ignore loss correctly
+...         labels = labels_batch["input_ids"].masked_fill(labels_batch.attention_mask.ne(1), -100)
+
+...         batch["labels"] = labels
+
+...         return batch
+```
+
+次に、`DataCollat​​orForCTCWithPadding` をインスタンス化します。
+
+```py
+>>> data_collator = DataCollatorCTCWithPadding(processor=processor, padding="longest")
+```
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[単語エラー率](https://huggingface.co/spaces/evaluate-metric/wer) (WER) メトリクスを読み込みます (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照して、メトリクスをロードして計算する方法の詳細を確認してください)。
+
+```py
+>>> import evaluate
+
+>>> wer = evaluate.load("wer")
+```
+
+次に、予測とラベルを [`~evaluate.EvaluationModule.compute`] に渡して WER を計算する関数を作成します。
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(pred):
+...     pred_logits = pred.predictions
+...     pred_ids = np.argmax(pred_logits, axis=-1)
+
+...     pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
+
+...     pred_str = processor.batch_decode(pred_ids)
+...     label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
+
+...     wer = wer.compute(predictions=pred_str, references=label_str)
+
+...     return {"wer": wer}
+```
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForCTC`] で Wav2Vec2 をロードします。 `ctc_loss_reduction` パラメータで適用する削減を指定します。多くの場合、デフォルトの合計ではなく平均を使用する方が適切です。
+
+```py
+>>> from transformers import AutoModelForCTC, TrainingArguments, Trainer
+
+>>> model = AutoModelForCTC.from_pretrained(
+...     "facebook/wav2vec2-base",
+...     ctc_loss_reduction="mean",
+...     pad_token_id=processor.tokenizer.pad_token_id,
+... )
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`トレーナー`] は WER を評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_asr_mind_model",
+...     per_device_train_batch_size=8,
+...     gradient_accumulation_steps=2,
+...     learning_rate=1e-5,
+...     warmup_steps=500,
+...     max_steps=2000,
+...     gradient_checkpointing=True,
+...     fp16=True,
+...     group_by_length=True,
+...     evaluation_strategy="steps",
+...     per_device_eval_batch_size=8,
+...     save_steps=1000,
+...     eval_steps=1000,
+...     logging_steps=25,
+...     load_best_model_at_end=True,
+...     metric_for_best_model="wer",
+...     greater_is_better=False,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=encoded_minds["train"],
+...     eval_dataset=encoded_minds["test"],
+...     tokenizer=processor,
+...     data_collator=data_collator,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+
+</pt>
+</frameworkcontent>
+
+<Tip>
+
+自動音声認識用にモデルを微調整する方法のより詳細な例については、英語 ASR および英語のこのブログ [投稿](https://huggingface.co/blog/fine-tune-wav2vec2-english) を参照してください。多言語 ASR については、この [投稿](https://huggingface.co/blog/fine-tune-xlsr-wav2vec2) を参照してください。
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論を実行したい音声ファイルをロードします。必要に応じて、オーディオ ファイルのサンプリング レートをモデルのサンプリング レートと一致するようにリサンプリングすることを忘れないでください。
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", "en-US", split="train")
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+>>> sampling_rate = dataset.features["audio"].sampling_rate
+>>> audio_file = dataset[0]["audio"]["path"]
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して自動音声認識用の`pipeline`をインスタンス化し、オーディオ ファイルをそれに渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model")
+>>> transcriber(audio_file)
+{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'}
+```
+
+<Tip>
+
+転写はまあまあですが、もっと良くなる可能性があります。さらに良い結果を得るには、より多くの例でモデルを微調整してみてください。
+
+</Tip>
+
+必要に応じて、「パイプライン」の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+
+プロセッサをロードしてオーディオ ファイルと文字起こしを前処理し、`input`を PyTorch テンソルとして返します。
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model")
+>>> inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+```
+
+Pass your inputs to the model and return the logits:
+
+```py
+>>> from transformers import AutoModelForCTC
+
+>>> model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model")
+>>> with torch.no_grad():
+...     logits = model(**inputs).logits
+```
+
+最も高い確率で予測された `input_ids` を取得し、プロセッサを使用して予測された `input_ids` をデコードしてテキストに戻します。
+
+
+```py
+>>> import torch
+
+>>> predicted_ids = torch.argmax(logits, dim=-1)
+>>> transcription = processor.batch_decode(predicted_ids)
+>>> transcription
+['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER']
+```
+
+</pt>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/audio_classification.md b/docs/source/ja/tasks/audio_classification.md
new file mode 100644
index 00000000000000..58d42f3f4d4ff1
--- /dev/null
+++ b/docs/source/ja/tasks/audio_classification.md
@@ -0,0 +1,330 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Audio classification
+
+[[open-in-colab]]
+
+<Youtube id="KWwzcmG98Ds"/>
+
+
+音声分類では、テキストと同様に、入力データから出力されたクラス ラベルを割り当てます。唯一の違いは、テキスト入力の代わりに生のオーディオ波形があることです。音声分類の実際的な応用例には、話者の意図、言語分類、さらには音による動物の種類の識別などがあります。
+
+このガイドでは、次の方法を説明します。
+
+1. [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) データセットで [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) を微調整して話者の意図を分類します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[Audio Spectrogram Transformer](../model_doc/audio-spectrogram-transformer), [Data2VecAudio](../model_doc/data2vec-audio), [Hubert](../model_doc/hubert), [SEW](../model_doc/sew), [SEW-D](../model_doc/sew-d), [UniSpeech](../model_doc/unispeech), [UniSpeechSat](../model_doc/unispeech-sat), [Wav2Vec2](../model_doc/wav2vec2), [Wav2Vec2-Conformer](../model_doc/wav2vec2-conformer), [WavLM](../model_doc/wavlm), [Whisper](../model_doc/whisper)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load MInDS-14 dataset
+
+まず、🤗 データセット ライブラリから MInDS-14 データセットをロードします。
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train` をより小さなトレインとテスト セットに分割します。これにより、完全なデータセットにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+```py
+>>> minds = minds.train_test_split(test_size=0.2)
+```
+
+次に、データセットを見てみましょう。
+
+```py
+>>> minds
+DatasetDict({
+    train: Dataset({
+        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
+        num_rows: 450
+    })
+    test: Dataset({
+        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
+        num_rows: 113
+    })
+})
+```
+
+
+データセットには`lang_id`や`english_transcription`などの多くの有用な情報が含まれていますが、このガイドでは`audio`と`intent_class`に焦点を当てます。 [`~datasets.Dataset.remove_columns`] メソッドを使用して他の列を削除します。
+
+```py
+>>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])
+```
+
+ここで例を見てみましょう。
+
+```py
+>>> minds["train"][0]
+{'audio': {'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00048828,
+         -0.00024414, -0.00024414], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav',
+  'sampling_rate': 8000},
+ 'intent_class': 2}
+```
+
+次の 2 つのフィールドがあります。
+
+- `audio`: 音声ファイルをロードしてリサンプリングするために呼び出す必要がある音声信号の 1 次元の `array`。
+- `intent_class`: スピーカーのインテントのクラス ID を表します。
+
+モデルがラベル ID からラベル名を取得しやすくするために、ラベル名を整数に、またはその逆にマップする辞書を作成します。
+
+```py
+>>> labels = minds["train"].features["intent_class"].names
+>>> label2id, id2label = dict(), dict()
+>>> for i, label in enumerate(labels):
+...     label2id[label] = str(i)
+...     id2label[str(i)] = label
+```
+
+これで、ラベル ID をラベル名に変換できるようになりました。
+
+```py
+>>> id2label[str(2)]
+'app_error'
+```
+
+## Preprocess
+
+次のステップでは、Wav2Vec2 特徴抽出プログラムをロードしてオーディオ信号を処理します。
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+MInDS-14 データセットのサンプリング レートは 8000khz です (この情報は [データセット カード](https://huggingface.co/datasets/PolyAI/minds14) で確認できます)。つまり、データセットを再サンプリングする必要があります。事前トレーニングされた Wav2Vec2 モデルを使用するには、16000kHz に設定します。
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([ 2.2098757e-05,  4.6582241e-05, -2.2803260e-05, ...,
+         -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav',
+  'sampling_rate': 16000},
+ 'intent_class': 2}
+```
+
+次に、次の前処理関数を作成します。
+
+1. `audio`列を呼び出してロードし、必要に応じてオーディオ ファイルをリサンプリングします。
+2. オーディオ ファイルのサンプリング レートが、モデルが事前トレーニングされたオーディオ データのサンプリング レートと一致するかどうかを確認します。この情報は、Wav2Vec2 [モデル カード](https://huggingface.co/facebook/wav2vec2-base) で見つけることができます。
+3. 入力の最大長を設定して、長い入力を切り捨てずにバッチ処理します。
+
+```py
+>>> def preprocess_function(examples):
+...     audio_arrays = [x["array"] for x in examples["audio"]]
+...     inputs = feature_extractor(
+...         audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
+...     )
+...     return inputs
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] 関数を使用します。 `batched=True` を設定してデータセットの複数の要素を一度に処理することで、`map` を高速化できます。不要な列を削除し、`intent_class` の名前を `label` に変更します。これはモデルが期待する名前であるためです。
+
+```py
+>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
+>>> encoded_minds = encoded_minds.rename_column("intent_class", "label")
+```
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) メトリクスを読み込みます (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照してください) メトリクスの読み込みと計算方法の詳細については、次を参照してください。
+
+```py
+>>> import evaluate
+
+>>> accuracy = evaluate.load("accuracy")
+```
+
+次に、予測とラベルを [`~evaluate.EvaluationModule.compute`] に渡して精度を計算する関数を作成します。
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions = np.argmax(eval_pred.predictions, axis=1)
+...     return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
+```
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForAudioClassification`] を使用して、予期されるラベルの数とラベル マッピングを使用して Wav2Vec2 を読み込みます。
+
+```py
+>>> from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
+
+>>> num_labels = len(id2label)
+>>> model = AutoModelForAudioClassification.from_pretrained(
+...     "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
+... )
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`トレーナー`] は精度を評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_mind_model",
+...     evaluation_strategy="epoch",
+...     save_strategy="epoch",
+...     learning_rate=3e-5,
+...     per_device_train_batch_size=32,
+...     gradient_accumulation_steps=4,
+...     per_device_eval_batch_size=32,
+...     num_train_epochs=10,
+...     warmup_ratio=0.1,
+...     logging_steps=10,
+...     load_best_model_at_end=True,
+...     metric_for_best_model="accuracy",
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=encoded_minds["train"],
+...     eval_dataset=encoded_minds["test"],
+...     tokenizer=feature_extractor,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+</frameworkcontent>
+
+<Tip>
+
+音声分類用のモデルを微調整する方法の詳細な例については、対応する [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb).
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論を実行したい音声ファイルをロードします。必要に応じて、オーディオ ファイルのサンプリング レートをモデルのサンプリング レートと一致するようにリサンプリングすることを忘れないでください。
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+>>> sampling_rate = dataset.features["audio"].sampling_rate
+>>> audio_file = dataset[0]["audio"]["path"]
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して音声分類用の`pipeline`をインスタンス化し、それに音声ファイルを渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model")
+>>> classifier(audio_file)
+[
+    {'score': 0.09766869246959686, 'label': 'cash_deposit'},
+    {'score': 0.07998877018690109, 'label': 'app_error'},
+    {'score': 0.0781070664525032, 'label': 'joint_account'},
+    {'score': 0.07667109370231628, 'label': 'pay_bill'},
+    {'score': 0.0755252093076706, 'label': 'balance'}
+]
+```
+
+必要に応じて、`pipeline` の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+
+特徴抽出器をロードしてオーディオ ファイルを前処理し、`input`を PyTorch テンソルとして返します。
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model")
+>>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+```
+
+入力をモデルに渡し、ロジットを返します。
+
+```py
+>>> from transformers import AutoModelForAudioClassification
+
+>>> model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model")
+>>> with torch.no_grad():
+...     logits = model(**inputs).logits
+```
+
+最も高い確率でクラスを取得し、モデルの `id2label` マッピングを使用してそれをラベルに変換します。
+
+```py
+>>> import torch
+
+>>> predicted_class_ids = torch.argmax(logits).item()
+>>> predicted_label = model.config.id2label[predicted_class_ids]
+>>> predicted_label
+'cash_deposit'
+```
+</pt>
+</frameworkcontent>
\ No newline at end of file
diff --git a/docs/source/ja/tasks/document_question_answering.md b/docs/source/ja/tasks/document_question_answering.md
new file mode 100644
index 00000000000000..478c6af2235490
--- /dev/null
+++ b/docs/source/ja/tasks/document_question_answering.md
@@ -0,0 +1,502 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Document Question Answering
+
+[[open-in-colab]]
+
+文書による質問応答は、文書による視覚的な質問応答とも呼ばれ、以下を提供するタスクです。
+ドキュメント画像に関する質問への回答。このタスクをサポートするモデルへの入力は通常、画像と画像の組み合わせです。
+質問があり、出力は自然言語で表現された回答です。これらのモデルは、以下を含む複数のモダリティを利用します。
+テキスト、単語の位置 (境界ボックス)、および画像自体。
+
+このガイドでは、次の方法を説明します。
+
+- [DocVQA データセット](https://huggingface.co/datasets/nielsr/docvqa_1200_examples_donut) の [LayoutLMv2](../model_doc/layoutlmv2) を微調整します。
+- 微調整されたモデルを推論に使用します。
+
+<Tip>
+
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+LayoutLMv2 は、最後の非表示のヘッダーの上に質問応答ヘッドを追加することで、ドキュメントの質問応答タスクを解決します。
+トークンの状態を調べて、トークンの開始トークンと終了トークンの位置を予測します。
+答え。言い換えれば、問題は抽出的質問応答として扱われます。つまり、コンテキストを考慮して、どの部分を抽出するかということです。
+の情報が質問に答えます。コンテキストは OCR エンジンの出力から取得されます。ここでは Google の Tesseract です。
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。 LayoutLMv2 は detectron2、torchvision、tesseract に依存します。
+
+```bash
+pip install -q transformers datasets
+```
+
+```bash
+pip install 'git+https://github.com/facebookresearch/detectron2.git'
+pip install torchvision
+```
+
+```bash
+sudo apt install tesseract-ocr
+pip install -q pytesseract
+```
+
+すべての依存関係をインストールしたら、ランタイムを再起動します。
+
+モデルをコミュニティと共有することをお勧めします。 Hugging Face アカウントにログインして、🤗 ハブにアップロードします。
+プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+いくつかのグローバル変数を定義しましょう。
+
+```py
+>>> model_checkpoint = "microsoft/layoutlmv2-base-uncased"
+>>> batch_size = 4
+```
+
+## Load the data
+
+このガイドでは、🤗 Hub にある前処理された DocVQA の小さなサンプルを使用します。フルに使いたい場合は、
+DocVQA データセットは、[DocVQA ホームページ](https://rrc.cvc.uab.es/?ch=17) で登録してダウンロードできます。そうすれば、
+このガイドを進めて、[🤗 データセットにファイルをロードする方法](https://huggingface.co/docs/datasets/loading#local-and-remote-files) を確認してください。
+
+
+```py
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("nielsr/docvqa_1200_examples")
+>>> dataset
+DatasetDict({
+    train: Dataset({
+        features: ['id', 'image', 'query', 'answers', 'words', 'bounding_boxes', 'answer'],
+        num_rows: 1000
+    })
+    test: Dataset({
+        features: ['id', 'image', 'query', 'answers', 'words', 'bounding_boxes', 'answer'],
+        num_rows: 200
+    })
+})
+```
+
+ご覧のとおり、データセットはすでにトレーニング セットとテスト セットに分割されています。理解するためにランダムな例を見てみましょう
+機能を備えた自分自身。
+
+```py
+>>> dataset["train"].features
+```
+
+個々のフィールドが表す内容は次のとおりです。
+* `id`: サンプルのID
+* `image`: ドキュメント画像を含む PIL.Image.Image オブジェクト
+* `query`: 質問文字列 - いくつかの言語での自然言語による質問
+* `answers`: ヒューマン アノテーターによって提供された正解のリスト
+* `words` と `bounding_boxes`: OCR の結果。ここでは使用しません。
+* `answer`: 別のモデルと一致する答え。ここでは使用しません。
+
+英語の質問だけを残し、別のモデルによる予測が含まれていると思われる`answer`機能を削除しましょう。
+また、アノテーターによって提供されたセットから最初の回答を取得します。あるいは、ランダムにサンプリングすることもできます。
+
+```py
+>>> updated_dataset = dataset.map(lambda example: {"question": example["query"]["en"]}, remove_columns=["query"])
+>>> updated_dataset = updated_dataset.map(
+...     lambda example: {"answer": example["answers"][0]}, remove_columns=["answer", "answers"]
+... )
+```
+
+このガイドで使用する LayoutLMv2 チェックポイントは、`max_position_embeddings = 512` でトレーニングされていることに注意してください (
+この情報は、[チェックポイントの `config.json` ファイル](https://huggingface.co/microsoft/layoutlmv2-base-uncased/blob/main/config.json#L18)) で見つけてください。
+例を省略することもできますが、答えが大きな文書の最後にあり、結局省略されてしまうという状況を避けるために、
+ここでは、埋め込みが 512 を超える可能性があるいくつかの例を削除します。
+データセット内のほとんどのドキュメントが長い場合は、スライディング ウィンドウ戦略を実装できます。詳細については、[このノートブック](https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb) を確認してください。 。
+
+```py
+>>> updated_dataset = updated_dataset.filter(lambda x: len(x["words"]) + len(x["question"].split()) < 512)
+```
+
+この時点で、このデータセットから OCR 機能も削除しましょう。これらは、異なるデータを微調整するための OCR の結果です。
+モデル。これらは入力要件と一致しないため、使用したい場合はさらに処理が必要になります。
+このガイドで使用するモデルの。代わりに、OCR と OCR の両方の元のデータに対して [`LayoutLMv2Processor`] を使用できます。
+トークン化。このようにして、モデルの予想される入力と一致する入力を取得します。画像を手動で加工したい場合は、
+モデルがどのような入力形式を想定しているかを知るには、[`LayoutLMv2` モデルのドキュメント](../model_doc/layoutlmv2) を確認してください。
+
+```py
+>>> updated_dataset = updated_dataset.remove_columns("words")
+>>> updated_dataset = updated_dataset.remove_columns("bounding_boxes")
+```
+
+最後に、画像サンプルを確認しないとデータ探索は完了しません。
+
+
+```py
+>>> updated_dataset["train"][11]["image"]
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/docvqa_example.jpg" alt="DocVQA Image Example"/>
+ </div>
+
+## Preprocess the data
+
+文書の質問に答えるタスクはマルチモーダル タスクであるため、各モダリティからの入力が確実に行われるようにする必要があります。
+モデルの期待に従って前処理されます。まず、[`LayoutLMv2Processor`] をロードします。これは、画像データを処理できる画像プロセッサとテキスト データをエンコードできるトークナイザーを内部で組み合わせています。
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained(model_checkpoint)
+```
+
+### Preprocessing document images
+
+まず、プロセッサからの `image_processor` を利用して、モデルのドキュメント画像を準備しましょう。
+デフォルトでは、画像プロセッサは画像のサイズを 224x224 に変更し、カラー チャネルの順序が正しいことを確認します。
+tesseract を使用して OCR を適用し、単語と正規化された境界ボックスを取得します。このチュートリアルでは、これらのデフォルトはすべて、まさに必要なものです。
+デフォルトの画像処理を画像のバッチに適用し、OCR の結果を返す関数を作成します。
+
+```py
+>>> image_processor = processor.image_processor
+
+
+>>> def get_ocr_words_and_boxes(examples):
+...     images = [image.convert("RGB") for image in examples["image"]]
+...     encoded_inputs = image_processor(images)
+
+...     examples["image"] = encoded_inputs.pixel_values
+...     examples["words"] = encoded_inputs.words
+...     examples["boxes"] = encoded_inputs.boxes
+
+...     return examples
+```
+
+この前処理をデータセット全体に高速に適用するには、[`~datasets.Dataset.map`] を使用します。
+
+```py
+>>> dataset_with_ocr = updated_dataset.map(get_ocr_words_and_boxes, batched=True, batch_size=2)
+```
+
+### Preprocessing text data
+
+画像に OCR を適用したら、データセットのテキスト部分をエンコードしてモデル用に準備する必要があります。
+これには、前のステップで取得した単語とボックスをトークンレベルの `input_ids`、`attention_mask`、
+`token_type_ids`と`bbox`。テキストを前処理するには、プロセッサからの`Tokenizer`が必要になります。
+
+```py
+>>> tokenizer = processor.tokenizer
+```
+
+前述の前処理に加えて、モデルのラベルを追加する必要もあります。 `xxxForQuestionAnswering` モデルの場合
+🤗 Transformers では、ラベルは `start_positions` と `end_positions` で構成され、どのトークンがその位置にあるかを示します。
+開始点と、どのトークンが回答の最後にあるか。
+
+それから始めましょう。より大きなリスト (単語リスト) 内のサブリスト (単語に分割された回答) を検索できるヘルパー関数を定義します。
+
+この関数は、`words_list` と `answer_list` という 2 つのリストを入力として受け取ります。次に、`words_list`を反復処理してチェックします。
+`words_list` (words_list[i]) 内の現在の単語が、answer_list (answer_list[0]) の最初の単語と等しいかどうか、および
+現在の単語から始まり、`answer_list` と同じ長さの `words_list` のサブリストは、`to answer_list` と等しくなります。
+この条件が true の場合、一致が見つかったことを意味し、関数は一致とその開始インデックス (idx) を記録します。
+とその終了インデックス (idx + len(answer_list) - 1)。複数の一致が見つかった場合、関数は最初のもののみを返します。
+一致するものが見つからない場合、関数は (`None`、0、および 0) を返します。
+
+```py
+>>> def subfinder(words_list, answer_list):
+...     matches = []
+...     start_indices = []
+...     end_indices = []
+...     for idx, i in enumerate(range(len(words_list))):
+...         if words_list[i] == answer_list[0] and words_list[i : i + len(answer_list)] == answer_list:
+...             matches.append(answer_list)
+...             start_indices.append(idx)
+...             end_indices.append(idx + len(answer_list) - 1)
+...     if matches:
+...         return matches[0], start_indices[0], end_indices[0]
+...     else:
+...         return None, 0, 0
+```
+
+この関数が答えの位置を見つける方法を説明するために、例で使用してみましょう。
+
+```py
+>>> example = dataset_with_ocr["train"][1]
+>>> words = [word.lower() for word in example["words"]]
+>>> match, word_idx_start, word_idx_end = subfinder(words, example["answer"].lower().split())
+>>> print("Question: ", example["question"])
+>>> print("Words:", words)
+>>> print("Answer: ", example["answer"])
+>>> print("start_index", word_idx_start)
+>>> print("end_index", word_idx_end)
+Question:  Who is in  cc in this letter?
+Words: ['wie', 'baw', 'brown', '&', 'williamson', 'tobacco', 'corporation', 'research', '&', 'development', 'internal', 'correspondence', 'to:', 'r.', 'h.', 'honeycutt', 'ce:', 't.f.', 'riehl', 'from:', '.', 'c.j.', 'cook', 'date:', 'may', '8,', '1995', 'subject:', 'review', 'of', 'existing', 'brainstorming', 'ideas/483', 'the', 'major', 'function', 'of', 'the', 'product', 'innovation', 'graup', 'is', 'to', 'develop', 'marketable', 'nove!', 'products', 'that', 'would', 'be', 'profitable', 'to', 'manufacture', 'and', 'sell.', 'novel', 'is', 'defined', 'as:', 'of', 'a', 'new', 'kind,', 'or', 'different', 'from', 'anything', 'seen', 'or', 'known', 'before.', 'innovation', 'is', 'defined', 'as:', 'something', 'new', 'or', 'different', 'introduced;', 'act', 'of', 'innovating;', 'introduction', 'of', 'new', 'things', 'or', 'methods.', 'the', 'products', 'may', 'incorporate', 'the', 'latest', 'technologies,', 'materials', 'and', 'know-how', 'available', 'to', 'give', 'then', 'a', 'unique', 'taste', 'or', 'look.', 'the', 'first', 'task', 'of', 'the', 'product', 'innovation', 'group', 'was', 'to', 'assemble,', 'review', 'and', 'categorize', 'a', 'list', 'of', 'existing', 'brainstorming', 'ideas.', 'ideas', 'were', 'grouped', 'into', 'two', 'major', 'categories', 'labeled', 'appearance', 'and', 'taste/aroma.', 'these', 'categories', 'are', 'used', 'for', 'novel', 'products', 'that', 'may', 'differ', 'from', 'a', 'visual', 'and/or', 'taste/aroma', 'point', 'of', 'view', 'compared', 'to', 'canventional', 'cigarettes.', 'other', 'categories', 'include', 'a', 'combination', 'of', 'the', 'above,', 'filters,', 'packaging', 'and', 'brand', 'extensions.', 'appearance', 'this', 'category', 'is', 'used', 'for', 'novel', 'cigarette', 'constructions', 'that', 'yield', 'visually', 'different', 'products', 'with', 'minimal', 'changes', 'in', 'smoke', 'chemistry', 'two', 'cigarettes', 'in', 'cne.', 'emulti-plug', 'te', 'build', 'yaur', 'awn', 'cigarette.', 'eswitchable', 'menthol', 'or', 'non', 'menthol', 'cigarette.', '*cigarettes', 'with', 'interspaced', 'perforations', 'to', 'enable', 'smoker', 'to', 'separate', 'unburned', 'section', 'for', 'future', 'smoking.', '«short', 'cigarette,', 'tobacco', 'section', '30', 'mm.', '«extremely', 'fast', 'buming', 'cigarette.', '«novel', 'cigarette', 'constructions', 'that', 'permit', 'a', 'significant', 'reduction', 'iretobacco', 'weight', 'while', 'maintaining', 'smoking', 'mechanics', 'and', 'visual', 'characteristics.', 'higher', 'basis', 'weight', 'paper:', 'potential', 'reduction', 'in', 'tobacco', 'weight.', '«more', 'rigid', 'tobacco', 'column;', 'stiffing', 'agent', 'for', 'tobacco;', 'e.g.', 'starch', '*colored', 'tow', 'and', 'cigarette', 'papers;', 'seasonal', 'promotions,', 'e.g.', 'pastel', 'colored', 'cigarettes', 'for', 'easter', 'or', 'in', 'an', 'ebony', 'and', 'ivory', 'brand', 'containing', 'a', 'mixture', 'of', 'all', 'black', '(black', 'paper', 'and', 'tow)', 'and', 'ail', 'white', 'cigarettes.', '499150498']
+Answer:  T.F. Riehl
+start_index 17
+end_index 18
+```
+
+ただし、サンプルがエンコードされると、次のようになります。
+
+```py
+>>> encoding = tokenizer(example["question"], example["words"], example["boxes"])
+>>> tokenizer.decode(encoding["input_ids"])
+[CLS] who is in cc in this letter? [SEP] wie baw brown & williamson tobacco corporation research & development ...
+```
+
+エンコードされた入力内で答えの位置を見つける必要があります。
+* `token_type_ids` は、どのトークンが質問の一部であり、どのトークンが文書の単語の一部であるかを示します。
+* `tokenizer.cls_token_id` は、入力の先頭で特別なトークンを見つけるのに役立ちます。
+* `word_ids` は、元の `words` で見つかった回答を、完全にエンコードされた入力内の同じ回答と照合して判断するのに役立ちます。
+エンコードされた入力内の応答の開始/終了位置。
+
+これを念頭に置いて、データセット内のサンプルのバッチをエンコードする関数を作成しましょう。
+
+
+```py
+>>> def encode_dataset(examples, max_length=512):
+...     questions = examples["question"]
+...     words = examples["words"]
+...     boxes = examples["boxes"]
+...     answers = examples["answer"]
+
+...     # encode the batch of examples and initialize the start_positions and end_positions
+...     encoding = tokenizer(questions, words, boxes, max_length=max_length, padding="max_length", truncation=True)
+...     start_positions = []
+...     end_positions = []
+
+...     # loop through the examples in the batch
+...     for i in range(len(questions)):
+...         cls_index = encoding["input_ids"][i].index(tokenizer.cls_token_id)
+
+...         # find the position of the answer in example's words
+...         words_example = [word.lower() for word in words[i]]
+...         answer = answers[i]
+...         match, word_idx_start, word_idx_end = subfinder(words_example, answer.lower().split())
+
+...         if match:
+...             # if match is found, use `token_type_ids` to find where words start in the encoding
+...             token_type_ids = encoding["token_type_ids"][i]
+...             token_start_index = 0
+...             while token_type_ids[token_start_index] != 1:
+...                 token_start_index += 1
+
+...             token_end_index = len(encoding["input_ids"][i]) - 1
+...             while token_type_ids[token_end_index] != 1:
+...                 token_end_index -= 1
+
+...             word_ids = encoding.word_ids(i)[token_start_index : token_end_index + 1]
+...             start_position = cls_index
+...             end_position = cls_index
+
+...             # loop over word_ids and increase `token_start_index` until it matches the answer position in words
+...             # once it matches, save the `token_start_index` as the `start_position` of the answer in the encoding
+...             for id in word_ids:
+...                 if id == word_idx_start:
+...                     start_position = token_start_index
+...                 else:
+...                     token_start_index += 1
+
+...             # similarly loop over `word_ids` starting from the end to find the `end_position` of the answer
+...             for id in word_ids[::-1]:
+...                 if id == word_idx_end:
+...                     end_position = token_end_index
+...                 else:
+...                     token_end_index -= 1
+
+...             start_positions.append(start_position)
+...             end_positions.append(end_position)
+
+...         else:
+...             start_positions.append(cls_index)
+...             end_positions.append(cls_index)
+
+...     encoding["image"] = examples["image"]
+...     encoding["start_positions"] = start_positions
+...     encoding["end_positions"] = end_positions
+
+...     return encoding
+```
+
+この前処理関数が完成したので、データセット全体をエンコードできます。
+
+```py
+>>> encoded_train_dataset = dataset_with_ocr["train"].map(
+...     encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["train"].column_names
+... )
+>>> encoded_test_dataset = dataset_with_ocr["test"].map(
+...     encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["test"].column_names
+... )
+```
+
+エンコードされたデータセットの特徴がどのようなものかを確認してみましょう。
+
+```py
+>>> encoded_train_dataset.features
+{'image': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='uint8', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None),
+ 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
+ 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
+ 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
+ 'bbox': Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None),
+ 'start_positions': Value(dtype='int64', id=None),
+ 'end_positions': Value(dtype='int64', id=None)}
+```
+
+## Evaluation
+
+文書の質問回答の評価には、大量の後処理が必要です。過剰摂取を避けるために
+現時点では、このガイドでは評価ステップを省略しています。 [`Trainer`] はトレーニング中に評価損失を計算するため、
+モデルのパフォーマンスについてまったくわからないわけではありません。抽出的質問応答は通常、F1/完全一致を使用して評価されます。
+自分で実装したい場合は、[質問応答の章](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) を確認してください。
+インスピレーションを得るためにハグフェイスコースの。
+
+## Train 
+
+おめでとう！このガイドの最も難しい部分を無事にナビゲートできたので、独自のモデルをトレーニングする準備が整いました。
+トレーニングには次の手順が含まれます。
+* 前処理と同じチェックポイントを使用して、[`AutoModelForDocumentQuestionAnswering`] でモデルを読み込みます。
+* [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。
+* サンプルをバッチ処理する関数を定義します。ここでは [`DefaultDataCollat​​or`] が適切に機能します。
+* モデル、データセット、データ照合器とともにトレーニング引数を [`Trainer`] に渡します。
+* [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> from transformers import AutoModelForDocumentQuestionAnswering
+
+>>> model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint)
+```
+
+[`TrainingArguments`] で、`output_dir` を使用してモデルの保存場所を指定し、必要に応じてハイパーパラメーターを構成します。
+モデルをコミュニティと共有したい場合は、`push_to_hub`を`True`に設定します (モデルをアップロードするには、Hugging Face にサインインする必要があります)。
+この場合、`output_dir`はモデルのチェックポイントがプッシュされるリポジトリの名前にもなります。
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> # REPLACE THIS WITH YOUR REPO ID
+>>> repo_id = "MariaK/layoutlmv2-base-uncased_finetuned_docvqa"
+
+>>> training_args = TrainingArguments(
+...     output_dir=repo_id,
+...     per_device_train_batch_size=4,
+...     num_train_epochs=20,
+...     save_steps=200,
+...     logging_steps=50,
+...     evaluation_strategy="steps",
+...     learning_rate=5e-5,
+...     save_total_limit=2,
+...     remove_unused_columns=False,
+...     push_to_hub=True,
+... )
+```
+
+サンプルをまとめてバッチ処理するための単純なデータ照合器を定義します。
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator()
+```
+
+最後に、すべてをまとめて、[`~Trainer.train`] を呼び出します。
+
+```py
+>>> from transformers import Trainer
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     data_collator=data_collator,
+...     train_dataset=encoded_train_dataset,
+...     eval_dataset=encoded_test_dataset,
+...     tokenizer=processor,
+... )
+
+>>> trainer.train()
+```
+
+最終モデルを 🤗 Hub に追加するには、モデル カードを作成し、`push_to_hub` を呼び出します。
+
+```py
+>>> trainer.create_model_card()
+>>> trainer.push_to_hub()
+```
+
+## Inference
+
+LayoutLMv2 モデルを微調整し、🤗 ハブにアップロードしたので、それを推論に使用できます。もっとも単純な
+推論用に微調整されたモデルを試す方法は、それを [`Pipeline`] で使用することです。
+
+例を挙げてみましょう:
+```py
+>>> example = dataset["test"][2]
+>>> question = example["query"]["en"]
+>>> image = example["image"]
+>>> print(question)
+>>> print(example["answers"])
+'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?'
+['TRRF Vice President', 'lee a. waller']
+```
+
+次に、パイプラインをインスタンス化します。
+モデルを使用して質問への回答を文書化し、画像と質問の組み合わせをモデルに渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> qa_pipeline = pipeline("document-question-answering", model="MariaK/layoutlmv2-base-uncased_finetuned_docvqa")
+>>> qa_pipeline(image, question)
+[{'score': 0.9949808120727539,
+  'answer': 'Lee A. Waller',
+  'start': 55,
+  'end': 57}]
+```
+
+必要に応じて、パイプラインの結果を手動で複製することもできます。
+1. 画像と質問を取得し、モデルのプロセッサを使用してモデル用に準備します。
+2. モデルを通じて結果または前処理を転送します。
+3. モデルは`start_logits`と`end_logits`を返します。これらは、どのトークンが応答の先頭にあるのかを示し、
+どのトークンが回答の最後にありますか。どちらも形状 (batch_size、sequence_length) を持ちます。
+4. `start_logits` と `end_logits` の両方の最後の次元で argmax を取得し、予測される `start_idx` と `end_idx` を取得します。
+5. トークナイザーを使用して回答をデコードします。
+
+```py
+>>> import torch
+>>> from transformers import AutoProcessor
+>>> from transformers import AutoModelForDocumentQuestionAnswering
+
+>>> processor = AutoProcessor.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa")
+>>> model = AutoModelForDocumentQuestionAnswering.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa")
+
+>>> with torch.no_grad():
+...     encoding = processor(image.convert("RGB"), question, return_tensors="pt")
+...     outputs = model(**encoding)
+...     start_logits = outputs.start_logits
+...     end_logits = outputs.end_logits
+...     predicted_start_idx = start_logits.argmax(-1).item()
+...     predicted_end_idx = end_logits.argmax(-1).item()
+
+>>> processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1])
+'lee a. waller'
+```
diff --git a/docs/source/ja/tasks/idefics.md b/docs/source/ja/tasks/idefics.md
new file mode 100644
index 00000000000000..7a6c0758815e1b
--- /dev/null
+++ b/docs/source/ja/tasks/idefics.md
@@ -0,0 +1,430 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# Image tasks with IDEFICS
+
+[[open-in-colab]]
+
+個別のタスクは特殊なモデルを微調整することで対処できますが、別のアプローチも可能です。
+最近登場して人気を博しているのは、微調整を行わずにさまざまなタスクに大規模なモデルを使用することです。
+たとえば、大規模な言語モデルは、要約、翻訳、分類などの NLP タスクを処理できます。
+このアプローチは、テキストなどの単一のモダリティに限定されなくなりました。このガイドでは、次のような方法を説明します。
+IDEFICS と呼ばれる大規模なマルチモーダル モデルを使用して、画像とテキストのタスクを解決します。
+
+[IDEFICS](../model_doc/idefics) は、[Flamingo](https://huggingface.co/papers/2204.14198) に基づくオープンアクセスのビジョンおよび言語モデルです。
+DeepMind によって最初に開発された最先端の視覚言語モデル。モデルは任意の画像シーケンスを受け入れます
+テキストを入力し、出力として一貫したテキストを生成します。画像に関する質問に答えたり、視覚的なコンテンツについて説明したり、
+複数のイメージに基づいたストーリーを作成するなど。 IDEFICS には 2 つのバリエーションがあります - [800 億パラメータ](https://huggingface.co/HuggingFaceM4/idefics-80b)
+および [90 億のパラメータ](https://huggingface.co/HuggingFaceM4/idefics-9b)、どちらも 🤗 Hub で入手できます。各バリエーションについて、細かく調整された指示も見つけることができます。
+会話のユースケースに適応したモデルのバージョン。
+
+このモデルは非常に多用途で、幅広い画像タスクやマルチモーダル タスクに使用できます。しかし、
+大規模なモデルであるということは、大量の計算リソースとインフラストラクチャが必要であることを意味します。それはあなた次第です
+このアプローチは、個別のタスクごとに特化したモデルを微調整するよりも、ユースケースに適しています。
+
+このガイドでは、次の方法を学習します。
+- [IDEFICS をロード](#loading-the-model) および [モデルの量子化バージョンをロード](#loading-the-quantized-version-of-the-model)
+- IDEFICS を次の目的で使用します。
+  - [画像キャプション](#image-captioning)
+  - [プロンプト画像キャプション](#prompted-image-captioning)
+  - [Few-shot プロンプト](#few-shot-prompting)
+  - [ビジュアル質問回答](#visual-question-answering)
+  - [画像分類](#image-classification)
+  - [画像ガイド付きテキスト生成](#image-guided-text-generation)
+- [バッチモードで推論を実行する](#running-inference-in-batch-mode)
+- [会話用に IDEFICS 命令を実行](#idefics-instruct-for-conversational-use)
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q bitsandbytes sentencepiece accelerate transformers
+```
+
+<Tip>
+量子化されていないバージョンのモデル チェックポイントを使用して次の例を実行するには、少なくとも 20GB の GPU メモリが必要です。
+</Tip>
+
+## Loading the model
+
+まずはモデルの 90 億個のパラメーターのチェックポイントをロードしましょう。
+
+```py
+>>> checkpoint = "HuggingFaceM4/idefics-9b"
+```
+
+他の Transformers モデルと同様に、プロセッサとモデル自体をチェックポイントからロードする必要があります。
+IDEFICS プロセッサは、[`LlamaTokenizer`] と IDEFICS 画像プロセッサを単一のプロセッサにラップして処理します。
+モデルのテキストと画像の入力を準備します。
+
+
+```py
+>>> import torch
+
+>>> from transformers import IdeficsForVisionText2Text, AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
+```
+
+`device_map`を`auto`に設定すると、モデルの重みを最も最適化された状態でロードおよび保存する方法が自動的に決定されます。
+既存のデバイスを考慮した方法。
+
+### Quantized model
+
+ハイメモリ GPU の可用性が問題となる場合は、モデルの量子化されたバージョンをロードできます。モデルと
+プロセッサを 4 ビット精度で使用する場合、`BitsAndBytesConfig`を`from_pretrained`メソッドに渡すと、モデルが圧縮されます。
+ロード中にその場で。
+
+
+```py
+>>> import torch
+>>> from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
+
+>>> quantization_config = BitsAndBytesConfig(
+...     load_in_4bit=True,
+...     bnb_4bit_compute_dtype=torch.float16,
+... )
+
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+>>> model = IdeficsForVisionText2Text.from_pretrained(
+...     checkpoint,
+...     quantization_config=quantization_config,
+...     device_map="auto"
+... )
+```
+
+提案された方法のいずれかでモデルをロードしたので、IDEFICS を使用できるタスクの探索に進みましょう。
+
+## Image captioning
+
+画像のキャプション付けは、特定の画像のキャプションを予測するタスクです。一般的な用途は視覚障害者を支援することです
+人々はさまざまな状況をナビゲートします。たとえば、オンラインで画像コンテンツを探索します。
+
+タスクを説明するには、キャプションを付ける画像を取得します。例:
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-im-captioning.jpg" alt="Image of a puppy in a flower bed"/>
+</div>
+
+写真提供：[Hendo Wang](https://unsplash.com/@hendoo)
+
+IDEFICS はテキストと画像のプロンプトを受け入れます。ただし、画像にキャプションを付けるには、テキスト プロンプトをユーザーに提供する必要はありません。
+モデル、前処理された入力画像のみ。テキスト プロンプトがない場合、モデルはテキストの生成を開始します。
+BOS (Beginning-of-sequence) トークンによりキャプションが作成されます。
+
+モデルへの画像入力として、画像オブジェクト (`PIL.Image`) または画像を取得できる URL のいずれかを使用できます。
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+A puppy in a flower bed
+```
+
+<Tip>
+
+増加時に発生するエラーを避けるために、`generate`の呼び出しに`bad_words_ids`を含めることをお勧めします。
+`max_new_tokens`: モデルは、新しい `<image>` または `<fake_token_around_image>` トークンを生成する必要があります。
+モデルによって画像が生成されていません。
+このガイドのようにオンザフライで設定することも、[テキスト生成戦略](../generation_strategies) ガイドで説明されているように `GenerationConfig` に保存することもできます。
+</Tip>
+
+## Prompted image captioning
+
+テキスト プロンプトを提供することで画像キャプションを拡張でき、モデルは画像を指定して続行します。持っていきましょう
+別の図で説明します。
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-prompted-im-captioning.jpg" alt="Image of the Eiffel Tower at night"/>
+</div>
+
+写真提供：[Denys Nevozhai](https://unsplash.com/@dnevozhai)。
+   
+テキストおよび画像のプロンプトを単一のリストとしてモデルのプロセッサに渡し、適切な入力を作成できます。
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...     "This is an image of ",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+This is an image of the Eiffel Tower in Paris, France.
+```
+
+## Few-shot prompting
+
+IDEFICS はゼロショットで優れた結果を示しますが、タスクによっては特定の形式のキャプションが必要になる場合や、キャプションが付属する場合があります。
+タスクの複雑さを増大させるその他の制限または要件。少数のショットのプロンプトを使用して、コンテキスト内の学習を有効にすることができます。
+プロンプトに例を指定することで、指定された例の形式を模倣した結果を生成するようにモデルを操作できます。
+
+前のエッフェル塔の画像をモデルの例として使用し、モデルにデモンストレーションするプロンプトを作成してみましょう。
+画像内のオブジェクトが何であるかを知ることに加えて、それに関する興味深い情報も取得したいと考えています。
+次に、自由の女神の画像に対して同じ応答形式を取得できるかどうかを見てみましょう。
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg" alt="Image of the Statue of Liberty"/>
+</div>
+
+写真提供：[Juan Mayobre](https://unsplash.com/@jmayobres)。
+
+```py
+>>> prompt = ["User:",
+...            "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...            "Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n",
+...            "User:",
+...            "https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3387&q=80",
+...            "Describe this image.\nAssistant:"
+...            ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=30, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+User: Describe this image.
+Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building. 
+User: Describe this image.
+Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty is 151 feet tall.
+```
+
+モデルは 1 つの例 (つまり、1 ショット) だけからタスクの実行方法を学習していることに注目してください。より複雑なタスクの場合は、
+より多くの例 (3 ショット、5 ショットなど) を自由に試してみてください。
+
+## Visual question answering
+
+Visual Question Answering (VQA) は、画像に基づいて自由形式の質問に答えるタスクです。画像に似ている
+キャプションは、アクセシビリティ アプリケーションだけでなく、教育 (視覚資料についての推論) にも使用できます。
+サービス（画像を基にした商品に関する質問）、画像検索など。
+
+このタスク用に新しい画像を取得しましょう。
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg" alt="Image of a couple having a picnic"/>
+</div>
+
+写真提供  [Jarritos Mexican Soda](https://unsplash.com/@jarritos).
+
+適切な指示をプロンプトすることで、モデルを画像キャプションから視覚的な質問への応答に導くことができます。
+
+```py
+>>> prompt = [
+...     "Instruction: Provide an answer to the question. Use the image to answer.\n",
+...     "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
+...     "Question: Where are these people and what's the weather like? Answer:"
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=20, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+Instruction: Provide an answer to the question. Use the image to answer.
+ Question: Where are these people and what's the weather like? Answer: They're in a park in New York City, and it's a beautiful day.
+```
+
+## Image classification
+
+IDEFICS は、次のデータを含むデータについて明示的にトレーニングしなくても、画像をさまざまなカテゴリに分類できます。
+これらの特定のカテゴリからのラベル付きの例。カテゴリのリストを指定し、その画像とテキストを使用して理解する
+機能を利用すると、モデルは画像がどのカテゴリに属する​​可能性が高いかを推測できます。
+
+たとえば、次のような野菜スタンドの画像があるとします。
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-classification.jpg" alt="Image of a vegetable stand"/>
+</div>
+
+写真提供：[Peter Wendt](https://unsplash.com/@peterwendt)。
+
+画像を次のいずれかのカテゴリに分類するようにモデルに指示できます。
+
+```py
+>>> categories = ['animals','vegetables', 'city landscape', 'cars', 'office']
+>>> prompt = [f"Instruction: Classify the following image into a single category from the following list: {categories}.\n",
+...     "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",    
+...     "Category: "
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=6, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+Instruction: Classify the following image into a single category from the following list: ['animals', 'vegetables', 'city landscape', 'cars', 'office'].
+Category: Vegetables
+```
+
+上の例では、画像を 1 つのカテゴリに分類するようにモデルに指示していますが、ランク分類を行うようにモデルに指示することもできます。
+
+## Image-guided text generation
+
+よりクリエイティブなアプリケーションの場合は、画像ガイド付きテキスト生成を使用して、画像に基づいてテキストを生成できます。これは可能です
+製品、広告、シーンの説明などを作成するのに役立ちます。
+
+IDEFICS に、赤いドアの単純な画像に基づいてストーリーを書くように促してみましょう。
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-story-generation.jpg" alt="Image of a red door with a pumpkin on the steps"/>
+</div>
+
+写真提供：[Craig Tidball](https://unsplash.com/@devonshiremedia)。
+
+```py
+>>> prompt = ["Instruction: Use the image to write a story. \n",
+...     "https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=2203&q=80",
+...     "Story: \n"]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, num_beams=2, max_new_tokens=200, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0]) 
+Instruction: Use the image to write a story. 
+ Story: 
+Once upon a time, there was a little girl who lived in a house with a red door.  She loved her red door.  It was the prettiest door in the whole world.
+
+One day, the little girl was playing in her yard when she noticed a man standing on her doorstep.  He was wearing a long black coat and a top hat.
+
+The little girl ran inside and told her mother about the man.
+
+Her mother said, “Don’t worry, honey.  He’s just a friendly ghost.”
+
+The little girl wasn’t sure if she believed her mother, but she went outside anyway.
+
+When she got to the door, the man was gone.
+
+The next day, the little girl was playing in her yard again when she noticed the man standing on her doorstep.
+
+He was wearing a long black coat and a top hat.
+
+The little girl ran
+```
+
+IDEFICS は玄関先にあるカボチャに気づき、幽霊に関する不気味なハロウィーンの話をしたようです。
+
+<Tip>
+
+このような長い出力の場合、テキスト生成戦略を微調整すると大きなメリットが得られます。これは役に立ちます
+生成される出力の品質が大幅に向上します。 [テキスト生成戦略](../generation_strategies) を確認してください。
+詳しく知ることができ。
+
+</Tip>
+
+## Running inference in batch mode
+
+これまでのすべてのセクションでは、IDEFICS を 1 つの例として説明しました。非常に似た方法で、推論を実行できます。
+プロンプトのリストを渡すことにより、サンプルのバッチを取得します。
+
+```py
+>>> prompts = [
+...     [   "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...         "This is an image of ",
+...     ],
+...     [   "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
+...         "This is an image of ",
+...     ],
+...     [   "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
+...         "This is an image of ",
+...     ],
+... ]
+
+>>> inputs = processor(prompts, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> for i,t in enumerate(generated_text):
+...     print(f"{i}:\n{t}\n") 
+0:
+This is an image of the Eiffel Tower in Paris, France.
+
+1:
+This is an image of a couple on a picnic blanket.
+
+2:
+This is an image of a vegetable stand.
+```
+
+## IDEFICS instruct for conversational use
+
+会話型のユースケースの場合は、🤗 ハブでモデルの微調整された指示されたバージョンを見つけることができます。
+`HuggingFaceM4/idefics-80b-instruct` および `HuggingFaceM4/idefics-9b-instruct`。
+
+これらのチェックポイントは、教師ありモデルと命令モデルを組み合わせたそれぞれの基本モデルを微調整した結果です。
+データセットを微調整することで、ダウンストリームのパフォーマンスを向上させながら、会話設定でモデルをより使いやすくします。
+
+会話での使用とプロンプトは、基本モデルの使用と非常に似ています。
+
+```py
+>>> import torch
+>>> from transformers import IdeficsForVisionText2Text, AutoProcessor
+
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+>>> checkpoint = "HuggingFaceM4/idefics-9b-instruct"
+>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+>>> prompts = [
+...     [
+...         "User: What is in this image?",
+...         "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG",
+...         "<end_of_utterance>",
+
+...         "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>",
+
+...         "\nUser:",
+...         "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052",
+...         "And who is that?<end_of_utterance>",
+
+...         "\nAssistant:",
+...     ],
+... ]
+
+>>> # --batched mode
+>>> inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device)
+>>> # --single sample mode
+>>> # inputs = processor(prompts[0], return_tensors="pt").to(device)
+
+>>> # Generation args
+>>> exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> for i, t in enumerate(generated_text):
+...     print(f"{i}:\n{t}\n")
+```
diff --git a/docs/source/ja/tasks/image_captioning.md b/docs/source/ja/tasks/image_captioning.md
new file mode 100644
index 00000000000000..c499cbacc9effe
--- /dev/null
+++ b/docs/source/ja/tasks/image_captioning.md
@@ -0,0 +1,276 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Image captioning
+
+[[open-in-colab]]
+
+画像のキャプション付けは、特定の画像のキャプションを予測するタスクです。一般的な現実世界のアプリケーションには次のものがあります。
+視覚障害者がさまざまな状況を乗り越えられるよう支援します。したがって、画像のキャプション
+画像を説明することで人々のコンテンツへのアクセシビリティを向上させるのに役立ちます。
+
+このガイドでは、次の方法を説明します。
+
+* 画像キャプション モデルを微調整します。
+* 微調整されたモデルを推論に使用します。
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate -q
+pip install jiwer -q
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+
+```python
+from huggingface_hub import notebook_login
+
+notebook_login()
+```
+
+## Load the Pokémon BLIP captions dataset
+
+🤗 データセット ライブラリを使用して、{image-caption} ペアで構成されるデータセットを読み込みます。独自の画像キャプション データセットを作成するには
+PyTorch では、[このノートブック](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/GIT/Fine_tune_GIT_on_an_image_captioning_dataset.ipynb) を参照できます。
+
+```py
+ds = load_dataset("lambdalabs/pokemon-blip-captions")
+ds
+```
+
+```bash
+DatasetDict({
+    train: Dataset({
+        features: ['image', 'text'],
+        num_rows: 833
+    })
+})
+```
+
+データセットには `image`と`text`の 2 つの機能があります。
+
+<Tip>
+
+多くの画像キャプション データセットには、画像ごとに複数のキャプションが含まれています。このような場合、一般的な戦略は、トレーニング中に利用可能なキャプションの中からランダムにキャプションをサンプリングすることです。
+
+</Tip>
+
+[~datasets.Dataset.train_test_split] メソッドを使用して、データセットのトレイン スプリットをトレイン セットとテスト セットに分割します。
+
+```python
+ds = ds["train"].train_test_split(test_size=0.1)
+train_ds = ds["train"]
+test_ds = ds["test"]
+```
+
+トレーニング セットからのいくつかのサンプルを視覚化してみましょう。
+
+```python
+from textwrap import wrap
+import matplotlib.pyplot as plt
+import numpy as np
+
+
+def plot_images(images, captions):
+    plt.figure(figsize=(20, 20))
+    for i in range(len(images)):
+        ax = plt.subplot(1, len(images), i + 1)
+        caption = captions[i]
+        caption = "\n".join(wrap(caption, 12))
+        plt.title(caption)
+        plt.imshow(images[i])
+        plt.axis("off")
+
+
+sample_images_to_visualize = [np.array(train_ds[i]["image"]) for i in range(5)]
+sample_captions = [train_ds[i]["text"] for i in range(5)]
+plot_images(sample_images_to_visualize, sample_captions)
+```
+    
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/sample_training_images_image_cap.png" alt="Sample training images"/>
+</div>
+
+## Preprocess the dataset
+
+データセットには 2 つのモダリティ (画像とテキスト) があるため、前処理パイプラインは画像とキャプションを前処理します。
+
+これを行うには、微調整しようとしているモデルに関連付けられたプロセッサ クラスをロードします。
+
+```python
+from transformers import AutoProcessor
+
+checkpoint = "microsoft/git-base"
+processor = AutoProcessor.from_pretrained(checkpoint)
+```
+
+
+プロセッサは内部で画像を前処理し (サイズ変更やピクセル スケーリングを含む)、キャプションをトークン化します。
+
+```python
+def transforms(example_batch):
+    images = [x for x in example_batch["image"]]
+    captions = [x for x in example_batch["text"]]
+    inputs = processor(images=images, text=captions, padding="max_length")
+    inputs.update({"labels": inputs["input_ids"]})
+    return inputs
+
+
+train_ds.set_transform(transforms)
+test_ds.set_transform(transforms)
+```
+
+データセットの準備ができたら、微調整用にモデルをセットアップできます。
+
+## Load a base model
+
+["microsoft/git-base"](https://huggingface.co/microsoft/git-base) を [`AutoModelForCausalLM`](https://huggingface.co/docs/transformers/model_doc/auto#transformers.AutoModelForCausalLM) オブジェクト。
+
+```python
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(checkpoint)
+```
+
+```python
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained(checkpoint)
+```
+## Evaluate
+
+画像キャプション モデルは通常、[Rouge Score](https://huggingface.co/spaces/evaluate-metric/rouge) または [Word Error Rate](https://huggingface.co/spaces/evaluate-metric/) で評価されます。そうだった）。このガイドでは、Word Error Rate (WER) を使用します。
+
+これを行うには 🤗 Evaluate ライブラリを使用します。 WER の潜在的な制限やその他の問題点については、[このガイド](https://huggingface.co/spaces/evaluate-metric/wer) を参照してください。
+
+```python
+from evaluate import load
+import torch
+
+wer = load("wer")
+
+
+def compute_metrics(eval_pred):
+    logits, labels = eval_pred
+    predicted = logits.argmax(-1)
+    decoded_labels = processor.batch_decode(labels, skip_special_tokens=True)
+    decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True)
+    wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels)
+    return {"wer_score": wer_score}
+```
+
+## Train!
+
+これで、モデルの微調整を開始する準備が整いました。これには 🤗 [`Trainer`] を使用します。
+
+まず、[`TrainingArguments`] を使用してトレーニング引数を定義します。
+
+```python
+from transformers import TrainingArguments, Trainer
+
+model_name = checkpoint.split("/")[1]
+
+training_args = TrainingArguments(
+    output_dir=f"{model_name}-pokemon",
+    learning_rate=5e-5,
+    num_train_epochs=50,
+    fp16=True,
+    per_device_train_batch_size=32,
+    per_device_eval_batch_size=32,
+    gradient_accumulation_steps=2,
+    save_total_limit=3,
+    evaluation_strategy="steps",
+    eval_steps=50,
+    save_strategy="steps",
+    save_steps=50,
+    logging_steps=50,
+    remove_unused_columns=False,
+    push_to_hub=True,
+    label_names=["labels"],
+    load_best_model_at_end=True,
+)
+```
+
+Trainer 次に、次に、データセットとモデルと一緒に 🤗 に渡します。
+
+```python
+trainer = Trainer(
+    model=model,
+    args=training_args,
+    train_dataset=train_ds,
+    eval_dataset=test_ds,
+    compute_metrics=compute_metrics,
+)
+```
+
+トレーニングを開始するには、[`Trainer`] オブジェクトの [`~Trainer.train`] を呼び出すだけです。
+
+```python 
+trainer.train()
+```
+
+トレーニングが進むにつれて、トレーニングの損失がスムーズに減少することがわかります。
+
+トレーニングが完了したら、 [`~Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```python
+trainer.push_to_hub()
+```
+
+## Inference
+
+`test_ds` からサンプル画像を取得してモデルをテストします。
+
+```python
+from PIL import Image
+import requests
+
+url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png"
+image = Image.open(requests.get(url, stream=True).raw)
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/test_image_image_cap.png" alt="Test image"/>
+</div>
+
+モデル用の画像を準備します。
+
+```python
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+inputs = processor(images=image, return_tensors="pt").to(device)
+pixel_values = inputs.pixel_values
+```
+
+[`generate`] を呼び出して予測をデコードします。
+
+```python
+generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
+generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+print(generated_caption)
+```
+```bash
+a drawing of a pink and blue pokemon
+```
+
+微調整されたモデルにより、非常に優れたキャプションが生成されたようです。
+
+
+
+
diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md
new file mode 100644
index 00000000000000..f8d8d0d55238b9
--- /dev/null
+++ b/docs/source/ja/tasks/image_classification.md
@@ -0,0 +1,559 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# Image classification
+
+[[open-in-colab]]
+
+<Youtube id="tjAIM7BOYhw"/>
+
+画像分類では、画像にラベルまたはクラスを割り当てます。テキストや音声の分類とは異なり、入力は
+画像を構成するピクセル値。損傷の検出など、画像分類には多くの用途があります
+自然災害の後、作物の健康状態を監視したり、病気の兆候がないか医療画像をスクリーニングしたりするのに役立ちます。
+
+このガイドでは、次の方法を説明します。
+
+1. [Food-101](https://huggingface.co/datasets/food101) データセットの [ViT](model_doc/vit) を微調整して、画像内の食品を分類します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[BEiT](../model_doc/beit), [BiT](../model_doc/bit), [ConvNeXT](../model_doc/convnext), [ConvNeXTV2](../model_doc/convnextv2), [CvT](../model_doc/cvt), [Data2VecVision](../model_doc/data2vec-vision), [DeiT](../model_doc/deit), [DiNAT](../model_doc/dinat), [DINOv2](../model_doc/dinov2), [EfficientFormer](../model_doc/efficientformer), [EfficientNet](../model_doc/efficientnet), [FocalNet](../model_doc/focalnet), [ImageGPT](../model_doc/imagegpt), [LeViT](../model_doc/levit), [MobileNetV1](../model_doc/mobilenet_v1), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [NAT](../model_doc/nat), [Perceiver](../model_doc/perceiver), [PoolFormer](../model_doc/poolformer), [PVT](../model_doc/pvt), [RegNet](../model_doc/regnet), [ResNet](../model_doc/resnet), [SegFormer](../model_doc/segformer), [SwiftFormer](../model_doc/swiftformer), [Swin Transformer](../model_doc/swin), [Swin Transformer V2](../model_doc/swinv2), [VAN](../model_doc/van), [ViT](../model_doc/vit), [ViT Hybrid](../model_doc/vit_hybrid), [ViTMSN](../model_doc/vit_msn)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate
+```
+
+Hugging Face アカウントにログインして、モデルをアップロードしてコミュニティと共有することをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load Food-101 dataset
+
+Datasets、🤗 データセット ライブラリから Food-101 データセットの小さいサブセットを読み込みます。これにより、次の機会が得られます
+完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認してください。
+
+```py
+>>> from datasets import load_dataset
+
+>>> food = load_dataset("food101", split="train[:5000]")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train` 分割をトレイン セットとテスト セットに分割します。
+
+
+```py
+>>> food = food.train_test_split(test_size=0.2)
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> food["train"][0]
+{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512 at 0x7F52AFC8AC50>,
+ 'label': 79}
+```
+
+データセット内の各例には 2 つのフィールドがあります。
+
+- `image`: 食品の PIL 画像
+- `label`: 食品のラベルクラス
+
+モデルがラベル ID からラベル名を取得しやすくするために、ラベル名をマップする辞書を作成します。
+整数への変換、またはその逆:
+
+```py
+>>> labels = food["train"].features["label"].names
+>>> label2id, id2label = dict(), dict()
+>>> for i, label in enumerate(labels):
+...     label2id[label] = str(i)
+...     id2label[str(i)] = label
+```
+
+これで、ラベル ID をラベル名に変換できるようになりました。
+
+```py
+>>> id2label[str(79)]
+'prime_rib'
+```
+
+## Preprocess
+
+次のステップでは、ViT 画像プロセッサをロードして画像をテンソルに処理します。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> checkpoint = "google/vit-base-patch16-224-in21k"
+>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint)
+```
+
+
+<frameworkcontent>
+<pt>
+
+いくつかの画像変換を画像に適用して、モデルの過学習に対する堅牢性を高めます。ここでは torchvision の [`transforms`](https://pytorch.org/vision/stable/transforms.html) モジュールを使用しますが、任意の画像ライブラリを使用することもできます。
+
+画像のランダムな部分をトリミングし、サイズを変更し、画像の平均と標準偏差で正規化します。
+
+
+```py
+>>> from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
+
+>>> normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
+>>> size = (
+...     image_processor.size["shortest_edge"]
+...     if "shortest_edge" in image_processor.size
+...     else (image_processor.size["height"], image_processor.size["width"])
+... )
+>>> _transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])
+```
+
+次に、変換を適用し、画像の `pixel_values` (モデルへの入力) を返す前処理関数を作成します。
+
+```py
+>>> def transforms(examples):
+...     examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
+...     del examples["image"]
+...     return examples
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.with_transform`] メソッドを使用します。変換は、データセットの要素を読み込むときにオンザフライで適用されます。
+
+```py
+>>> food = food.with_transform(transforms)
+```
+
+次に、[`DefaultDataCollat​​or`] を使用してサンプルのバッチを作成します。 🤗 Transformers の他のデータ照合器とは異なり、`DefaultDataCollat​​or` はパディングなどの追加の前処理を適用しません。
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator()
+```
+</pt>
+</frameworkcontent>
+
+
+<frameworkcontent>
+<tf>
+
+過剰適合を回避し、モデルをより堅牢にするために、データセットのトレーニング部分にデータ拡張を追加します。
+ここでは、Keras 前処理レイヤーを使用してトレーニング データの変換 (データ拡張を含む) を定義します。
+検証データの変換 (中央のトリミング、サイズ変更、正規化のみ)。 `tf.image` または
+他のライブラリでも構いません。
+
+
+```py
+>>> from tensorflow import keras
+>>> from tensorflow.keras import layers
+
+>>> size = (image_processor.size["height"], image_processor.size["width"])
+
+>>> train_data_augmentation = keras.Sequential(
+...     [
+...         layers.RandomCrop(size[0], size[1]),
+...         layers.Rescaling(scale=1.0 / 127.5, offset=-1),
+...         layers.RandomFlip("horizontal"),
+...         layers.RandomRotation(factor=0.02),
+...         layers.RandomZoom(height_factor=0.2, width_factor=0.2),
+...     ],
+...     name="train_data_augmentation",
+... )
+
+>>> val_data_augmentation = keras.Sequential(
+...     [
+...         layers.CenterCrop(size[0], size[1]),
+...         layers.Rescaling(scale=1.0 / 127.5, offset=-1),
+...     ],
+...     name="val_data_augmentation",
+... )
+```
+
+次に、一度に 1 つの画像ではなく、画像のバッチに適切な変換を適用する関数を作成します。
+
+```py
+>>> import numpy as np
+>>> import tensorflow as tf
+>>> from PIL import Image
+
+
+>>> def convert_to_tf_tensor(image: Image):
+...     np_image = np.array(image)
+...     tf_image = tf.convert_to_tensor(np_image)
+...     # `expand_dims()` is used to add a batch dimension since
+...     # the TF augmentation layers operates on batched inputs.
+...     return tf.expand_dims(tf_image, 0)
+
+
+>>> def preprocess_train(example_batch):
+...     """Apply train_transforms across a batch."""
+...     images = [
+...         train_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"]
+...     ]
+...     example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images]
+...     return example_batch
+
+
+... def preprocess_val(example_batch):
+...     """Apply val_transforms across a batch."""
+...     images = [
+...         val_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"]
+...     ]
+...     example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images]
+...     return example_batch
+```
+
+🤗 データセット [`~datasets.Dataset.set_transform`] を使用して、その場で変換を適用します。
+
+```py
+food["train"].set_transform(preprocess_train)
+food["test"].set_transform(preprocess_val)
+```
+
+最後の前処理ステップとして、`DefaultDataCollat​​or`を使用してサンプルのバッチを作成します。 🤗 Transformers の他のデータ照合機能とは異なり、
+`DefaultDataCollat​​or` は、パディングなどの追加の前処理を適用しません。
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator(return_tensors="tf")
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。すぐにロードできます
+🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用した評価方法。このタスクでは、ロードします
+[accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) 指標 (詳細については、🤗 評価 [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照してくださいメトリクスをロードして計算する方法):
+
+```py
+>>> import evaluate
+
+>>> accuracy = evaluate.load("accuracy")
+```
+
+次に、予測とラベルを [`~evaluate.EvaluationModule.compute`] に渡して精度を計算する関数を作成します。
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions, labels = eval_pred
+...     predictions = np.argmax(predictions, axis=1)
+...     return accuracy.compute(predictions=predictions, references=labels)
+```
+
+これで `compute_metrics`関数の準備が整いました。トレーニングを設定するときにこの関数に戻ります。
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForImageClassification`] を使用して ViT をロードします。ラベルの数と予想されるラベルの数、およびラベル マッピングを指定します。
+
+```py
+>>> from transformers import AutoModelForImageClassification, TrainingArguments, Trainer
+
+>>> model = AutoModelForImageClassification.from_pretrained(
+...     checkpoint,
+...     num_labels=len(labels),
+...     id2label=id2label,
+...     label2id=label2id,
+... )
+```
+
+この時点で残っているステップは 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。 `image` 列が削除されるため、未使用の列を削除しないことが重要です。 `image` 列がないと、`pixel_values` を作成できません。この動作を防ぐには、`remove_unused_columns=False`を設定してください。他に必要なパラメータは、モデルの保存場所を指定する `output_dir` だけです。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は精度を評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_food_model",
+...     remove_unused_columns=False,
+...     evaluation_strategy="epoch",
+...     save_strategy="epoch",
+...     learning_rate=5e-5,
+...     per_device_train_batch_size=16,
+...     gradient_accumulation_steps=4,
+...     per_device_eval_batch_size=16,
+...     num_train_epochs=3,
+...     warmup_ratio=0.1,
+...     logging_steps=10,
+...     load_best_model_at_end=True,
+...     metric_for_best_model="accuracy",
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     data_collator=data_collator,
+...     train_dataset=food["train"],
+...     eval_dataset=food["test"],
+...     tokenizer=image_processor,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+
+<Tip>
+
+
+Keras を使用したモデルの微調整に慣れていない場合は、まず [基本チュートリアル](./training#train-a-tensorflow-model-with-keras) を確認してください。
+
+</Tip>
+
+
+TensorFlow でモデルを微調整するには、次の手順に従います。
+1. トレーニングのハイパーパラメータを定義し、オプティマイザーと学習率スケジュールを設定します。
+2. 事前トレーニングされたモデルをインスタンス化します。
+3. 🤗 データセットを `tf.data.Dataset` に変換します。
+4. モデルをコンパイルします。
+5. コールバックを追加し、`fit()` メソッドを使用してトレーニングを実行します。
+6. モデルを 🤗 Hub にアップロードしてコミュニティと共有します。
+
+まず、ハイパーパラメーター、オプティマイザー、学習率スケジュールを定義します。
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 16
+>>> num_epochs = 5
+>>> num_train_steps = len(food["train"]) * num_epochs
+>>> learning_rate = 3e-5
+>>> weight_decay_rate = 0.01
+
+>>> optimizer, lr_schedule = create_optimizer(
+...     init_lr=learning_rate,
+...     num_train_steps=num_train_steps,
+...     weight_decay_rate=weight_decay_rate,
+...     num_warmup_steps=0,
+... )
+```
+
+次に、ラベル マッピングとともに [`TFAutoModelForImageClassification`] を使用して ViT を読み込みます。
+
+```py
+>>> from transformers import TFAutoModelForImageClassification
+
+>>> model = TFAutoModelForImageClassification.from_pretrained(
+...     checkpoint,
+...     id2label=id2label,
+...     label2id=label2id,
+... )
+```
+
+Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Dataset.to_tf_dataset`] and your `data_collator`:
+
+```py
+>>> # converting our train dataset to tf.data.Dataset
+>>> tf_train_dataset = food["train"].to_tf_dataset(
+...     columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator
+... )
+
+>>> # converting our test dataset to tf.data.Dataset
+>>> tf_eval_dataset = food["test"].to_tf_dataset(
+...     columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator
+... )
+```
+
+`compile()` を使用してトレーニング用にモデルを設定します。
+
+```py
+>>> from tensorflow.keras.losses import SparseCategoricalCrossentropy
+
+>>> loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+>>> model.compile(optimizer=optimizer, loss=loss)
+```
+
+予測から精度を計算し、モデルを 🤗 ハブにプッシュするには、[Keras callbacks](../main_classes/keras_callbacks) を使用します。
+`compute_metrics` 関数を [KerasMetricCallback](../main_classes/keras_callbacks#transformers.KerasMetricCallback) に渡します。
+[PushToHubCallback](../main_classes/keras_callbacks#transformers.PushToHubCallback) を使用してモデルをアップロードします。
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset)
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="food_classifier",
+...     tokenizer=image_processor,
+...     save_strategy="no",
+... )
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルをトレーニングする準備が整いました。トレーニングおよび検証データセット、エポック数、
+モデルを微調整するためのコールバック:
+
+```py
+>>> model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks)
+Epoch 1/5
+250/250 [==============================] - 313s 1s/step - loss: 2.5623 - val_loss: 1.4161 - accuracy: 0.9290
+Epoch 2/5
+250/250 [==============================] - 265s 1s/step - loss: 0.9181 - val_loss: 0.6808 - accuracy: 0.9690
+Epoch 3/5
+250/250 [==============================] - 252s 1s/step - loss: 0.3910 - val_loss: 0.4303 - accuracy: 0.9820
+Epoch 4/5
+250/250 [==============================] - 251s 1s/step - loss: 0.2028 - val_loss: 0.3191 - accuracy: 0.9900
+Epoch 5/5
+250/250 [==============================] - 238s 949ms/step - loss: 0.1232 - val_loss: 0.3259 - accuracy: 0.9890
+```
+
+おめでとう！モデルを微調整し、🤗 Hub で共有しました。これで推論に使用できるようになりました。
+</tf>
+</frameworkcontent>
+
+
+<Tip>
+
+画像分類用のモデルを微調整する方法の詳細な例については、対応する [PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb)
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論を実行したい画像を読み込みます。
+
+```py
+>>> ds = load_dataset("food101", split="validation[:10]")
+>>> image = ds["image"][0]
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png" alt="image of beignets"/>
+</div>
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して画像分類用の`pipeline`をインスタンス化し、それに画像を渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline("image-classification", model="my_awesome_food_model")
+>>> classifier(image)
+[{'score': 0.31856709718704224, 'label': 'beignets'},
+ {'score': 0.015232225880026817, 'label': 'bruschetta'},
+ {'score': 0.01519392803311348, 'label': 'chicken_wings'},
+ {'score': 0.013022331520915031, 'label': 'pork_chop'},
+ {'score': 0.012728818692266941, 'label': 'prime_rib'}]
+```
+
+必要に応じて、`pipeline`の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+
+画像プロセッサをロードして画像を前処理し、`input`を PyTorch テンソルとして返します。
+
+```py
+>>> from transformers import AutoImageProcessor
+>>> import torch
+
+>>> image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model")
+>>> inputs = image_processor(image, return_tensors="pt")
+```
+
+入力をモデルに渡し、ロジットを返します。
+
+```py
+>>> from transformers import AutoModelForImageClassification
+
+>>> model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model")
+>>> with torch.no_grad():
+...     logits = model(**inputs).logits
+```
+
+最も高い確率で予測されたラベルを取得し、モデルの `id2label` マッピングを使用してラベルに変換します。
+
+```py
+>>> predicted_label = logits.argmax(-1).item()
+>>> model.config.id2label[predicted_label]
+'beignets'
+```
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+
+画像プロセッサをロードして画像を前処理し、`input`を TensorFlow テンソルとして返します。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/food_classifier")
+>>> inputs = image_processor(image, return_tensors="tf")
+```
+
+入力をモデルに渡し、ロジットを返します。
+
+```py
+>>> from transformers import TFAutoModelForImageClassification
+
+>>> model = TFAutoModelForImageClassification.from_pretrained("MariaK/food_classifier")
+>>> logits = model(**inputs).logits
+```
+
+最も高い確率で予測されたラベルを取得し、モデルの `id2label` マッピングを使用してラベルに変換します。
+
+
+```py
+>>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
+>>> model.config.id2label[predicted_class_id]
+'beignets'
+```
+
+</tf>
+</frameworkcontent>
+
diff --git a/docs/source/ja/tasks/image_to_image.md b/docs/source/ja/tasks/image_to_image.md
new file mode 100644
index 00000000000000..cd429a4e1e27c3
--- /dev/null
+++ b/docs/source/ja/tasks/image_to_image.md
@@ -0,0 +1,135 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Image-to-Image Task Guide
+
+[[open-in-colab]]
+
+Image-to-Image タスクは、アプリケーションが画像を受信し、別の画像を出力するタスクです。これには、画像強化 (超解像度、低光量強化、ディレインなど)、画像修復などを含むさまざまなサブタスクがあります。
+
+このガイドでは、次の方法を説明します。
+- 超解像度タスクに画像間のパイプラインを使用します。
+- パイプラインを使用せずに、同じタスクに対してイメージ間モデルを実行します。
+
+このガイドがリリースされた時点では、`image-to-image`パイプラインは超解像度タスクのみをサポートしていることに注意してください。
+
+必要なライブラリをインストールすることから始めましょう。
+
+```bash
+pip install transformers
+```
+
+[Swin2SR モデル](https://huggingface.co/caidas/swin2SR-lightweight-x2-64) を使用してパイプラインを初期化できるようになりました。次に、イメージを使用してパイプラインを呼び出すことで、パイプラインを推論できます。現時点では、[Swin2SR モデル](https://huggingface.co/models?sort=trending&search=swin2sr) のみがこのパイプラインでサポートされています。
+
+```python
+from transformers import pipeline
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device)
+```
+
+では、画像を読み込みましょう。
+
+```python
+from PIL import Image
+import requests
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+print(image.size)
+```
+```bash
+# (532, 432)
+```
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg" alt="Photo of a cat"/>
+</div>
+
+
+これで、パイプラインを使用して推論を実行できるようになりました。猫の画像の拡大バージョンを取得します。
+
+```python
+upscaled = pipe(image)
+print(upscaled.size)
+```
+```bash
+# (1072, 880)
+```
+
+パイプラインを使用せずに自分で推論を実行したい場合は、トランスフォーマーの `Swin2SRForImageSuperResolution` クラスと `Swin2SRImageProcessor` クラスを使用できます。これには同じモデルのチェックポイントを使用します。モデルとプロセッサを初期化しましょう。
+
+```python
+from transformers import Swin2SRForImageSuperResolution, Swin2SRImageProcessor 
+
+model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-lightweight-x2-64").to(device)
+processor = Swin2SRImageProcessor("caidas/swin2SR-lightweight-x2-64")
+```
+
+`pipeline`」は、自分で行う必要がある前処理と後処理のステップを抽象化するので、画像を前処理しましょう。画像をプロセッサに渡してから、ピクセル値を GPU に移動します。
+
+```python
+pixel_values = processor(image, return_tensors="pt").pixel_values
+print(pixel_values.shape)
+
+pixel_values = pixel_values.to(device)
+```
+
+これで、ピクセル値をモデルに渡すことで画像を推測できるようになりました。
+
+```python
+import torch
+
+with torch.no_grad():
+  outputs = model(pixel_values)
+```
+
+出力は、以下のような `ImageSuperResolutionOutput` タイプのオブジェクトです 👇
+
+```
+(loss=None, reconstruction=tensor([[[[0.8270, 0.8269, 0.8275,  ..., 0.7463, 0.7446, 0.7453],
+          [0.8287, 0.8278, 0.8283,  ..., 0.7451, 0.7448, 0.7457],
+          [0.8280, 0.8273, 0.8269,  ..., 0.7447, 0.7446, 0.7452],
+          ...,
+          [0.5923, 0.5933, 0.5924,  ..., 0.0697, 0.0695, 0.0706],
+          [0.5926, 0.5932, 0.5926,  ..., 0.0673, 0.0687, 0.0705],
+          [0.5927, 0.5914, 0.5922,  ..., 0.0664, 0.0694, 0.0718]]]],
+       device='cuda:0'), hidden_states=None, attentions=None)
+```
+
+`reconstruction`を取得し、それを視覚化するために後処理する必要があります。どのように見えるか見てみましょう。
+
+```python
+outputs.reconstruction.data.shape
+# torch.Size([1, 3, 880, 1072])
+```
+
+出力を圧縮して軸 0 を削除し、値をクリップしてから、それを numpy float に変換する必要があります。次に、軸を [1072, 880] の形状になるように配置し、最後に出力を範囲 [0, 255] に戻します。
+
+```python
+import numpy as np
+
+# squeeze, take to CPU and clip the values
+output = outputs.reconstruction.data.squeeze().cpu().clamp_(0, 1).numpy()
+# rearrange the axes
+output = np.moveaxis(output, source=0, destination=-1)
+# bring values back to pixel values range
+output = (output * 255.0).round().astype(np.uint8)
+Image.fromarray(output)
+```
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat_upscaled.png" alt="Upscaled photo of a cat"/>
+</div>
\ No newline at end of file
diff --git a/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md b/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md
new file mode 100644
index 00000000000000..204b1f1e2f88c3
--- /dev/null
+++ b/docs/source/ja/tasks/knowledge_distillation_for_image_classification.md
@@ -0,0 +1,188 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+# Knowledge Distillation for Computer Vision
+
+[[open-in-colab]]
+
+知識の蒸留は、より大規模で複雑なモデル (教師) からより小規模で単純なモデル (生徒) に知識を伝達するために使用される手法です。あるモデルから別のモデルに知識を抽出するには、特定のタスク (この場合は画像分類) でトレーニングされた事前トレーニング済み教師モデルを取得し、画像分類でトレーニングされる生徒モデルをランダムに初期化します。次に、学生モデルをトレーニングして、その出力と教師の出力の差を最小限に抑え、動作を模倣します。これは [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531) で最初に導入されました。このガイドでは、タスク固有の知識の蒸留を行います。これには [Beans データセット](https://huggingface.co/datasets/beans) を使用します。
+
+このガイドでは、[微調整された ViT モデル](https://huggingface.co/merve/vit-mobilenet-beans-224) (教師モデル) を抽出して [MobileNet](https://huggingface. co/google/mobilenet_v2_1.4_224) (学生モデル) 🤗 Transformers の [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) を使用します。
+
+蒸留とプロセスの評価に必要なライブラリをインストールしましょう。
+
+```bash
+pip install transformers datasets accelerate tensorboard evaluate --upgrade
+```
+
+この例では、教師モデルとして`merve/beans-vit-224`モデルを使用しています。これは、Bean データセットに基づいて微調整された`google/vit-base-patch16-224-in21k`に基づく画像分類モデルです。このモデルをランダムに初期化された MobileNetV2 に抽出します。
+
+次に、データセットをロードします。
+
+```python
+from datasets import load_dataset
+
+dataset = load_dataset("beans")
+```
+
+この場合、同じ解像度で同じ出力が返されるため、どちらのモデルの画像プロセッサも使用できます。 `dataset`の`map()`メソッドを使用して、データセットのすべての分割に前処理を適用します。
+
+```python
+from transformers import AutoImageProcessor
+teacher_processor = AutoImageProcessor.from_pretrained("merve/beans-vit-224")
+
+def process(examples):
+    processed_inputs = teacher_processor(examples["image"])
+    return processed_inputs
+
+processed_datasets = dataset.map(process, batched=True)
+```
+
+基本的に、我々は生徒モデル（ランダムに初期化されたMobileNet）が教師モデル（微調整されたビジョン変換器）を模倣することを望む。これを実現するために、まず教師と生徒からロジット出力を得る。次に、それぞれのソフトターゲットの重要度を制御するパラメータ`temperature`で分割する。`lambda`と呼ばれるパラメータは蒸留ロスの重要度を量る。この例では、`temperature=5`、`lambda=0.5`とする。生徒と教師の間の発散を計算するために、Kullback-Leibler発散損失を使用します。2つのデータPとQが与えられたとき、KLダイバージェンスはQを使ってPを表現するためにどれだけの余分な情報が必要かを説明します。もし2つが同じであれば、QからPを説明するために必要な他の情報はないので、それらのKLダイバージェンスはゼロになります。
+
+
+```python
+from transformers import TrainingArguments, Trainer
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class ImageDistilTrainer(Trainer):
+    def __init__(self, *args, teacher_model=None, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.teacher = teacher_model
+        self.student = student_model
+        self.loss_function = nn.KLDivLoss(reduction="batchmean")
+        device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+        self.teacher.to(device)
+        self.teacher.eval()
+        self.temperature = temperature
+        self.lambda_param = lambda_param
+
+    def compute_loss(self, student, inputs, return_outputs=False):
+        student_output = self.student(**inputs)
+
+        with torch.no_grad():
+          teacher_output = self.teacher(**inputs)
+
+        # Compute soft targets for teacher and student
+        soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1)
+        soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1)
+
+        # Compute the loss
+        distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)
+
+        # Compute the true label loss
+        student_target_loss = student_output.loss
+
+        # Calculate final loss
+        loss = (1. - self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss
+        return (loss, student_output) if return_outputs else loss
+```
+
+次に、Hugging Face Hub にログインして、`trainer`を通じてモデルを Hugging Face Hub にプッシュできるようにします。
+
+```python
+from huggingface_hub import notebook_login
+
+notebook_login()
+```
+
+教師モデルと生徒モデルである`TrainingArguments`を設定しましょう。
+
+```python
+from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification
+
+training_args = TrainingArguments(
+    output_dir="my-awesome-model",
+    num_train_epochs=30,
+    fp16=True,
+    logging_dir=f"{repo_name}/logs",
+    logging_strategy="epoch",
+    evaluation_strategy="epoch",
+    save_strategy="epoch",
+    load_best_model_at_end=True,
+    metric_for_best_model="accuracy",
+    report_to="tensorboard",
+    push_to_hub=True,
+    hub_strategy="every_save",
+    hub_model_id=repo_name,
+    )
+
+num_labels = len(processed_datasets["train"].features["labels"].names)
+
+# initialize models
+teacher_model = AutoModelForImageClassification.from_pretrained(
+    "merve/beans-vit-224",
+    num_labels=num_labels,
+    ignore_mismatched_sizes=True
+)
+
+# training MobileNetV2 from scratch
+student_config = MobileNetV2Config()
+student_config.num_labels = num_labels
+student_model = MobileNetV2ForImageClassification(student_config)
+```
+
+`compute_metrics` 関数を使用して、テスト セットでモデルを評価できます。この関数は、トレーニング プロセス中にモデルの`accuracy`と`f1`を計算するために使用されます。
+
+```python
+import evaluate
+import numpy as np
+
+accuracy = evaluate.load("accuracy")
+
+def compute_metrics(eval_pred):
+    predictions, labels = eval_pred
+    acc = accuracy.compute(references=labels, predictions=np.argmax(predictions, axis=1))
+    return {"accuracy": acc["accuracy"]}
+```
+
+定義したトレーニング引数を使用して`Trainer`を初期化しましょう。データ照合装置も初期化します。
+
+
+```python
+from transformers import DefaultDataCollator
+
+data_collator = DefaultDataCollator()
+trainer = ImageDistilTrainer(
+    student_model=student_model,
+    teacher_model=teacher_model,
+    training_args=training_args,
+    train_dataset=processed_datasets["train"],
+    eval_dataset=processed_datasets["validation"],
+    data_collator=data_collator,
+    tokenizer=teacher_extractor,
+    compute_metrics=compute_metrics,
+    temperature=5,
+    lambda_param=0.5
+)
+```
+
+これでモデルをトレーニングできるようになりました。
+
+```python
+trainer.train()
+```
+
+テスト セットでモデルを評価できます。
+
+
+```python
+trainer.evaluate(processed_datasets["test"])
+```
+
+テスト セットでは、モデルの精度は 72% に達します。蒸留効率の健全性チェックを行うために、同じハイパーパラメータを使用して Bean データセットで MobileNet を最初からトレーニングし、テスト セットで 63% の精度を観察しました。読者の皆様には、さまざまな事前トレーニング済み教師モデル、学生アーキテクチャ、蒸留パラメータを試していただき、その結果を報告していただくようお勧めします。抽出されたモデルのトレーニング ログとチェックポイントは [このリポジトリ](https://huggingface.co/merve/vit-mobilenet-beans-224) にあり、最初からトレーニングされた MobileNetV2 はこの [リポジトリ]( https://huggingface.co/merve/resnet-mobilenet-beans-5)。
diff --git a/docs/source/ja/tasks/language_modeling.md b/docs/source/ja/tasks/language_modeling.md
new file mode 100644
index 00000000000000..b7ad65c6c4a210
--- /dev/null
+++ b/docs/source/ja/tasks/language_modeling.md
@@ -0,0 +1,444 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Causal language modeling
+
+
+[[open-in-colab]]
+
+
+言語モデリングには、因果的モデリングとマスクされた言語モデリングの 2 つのタイプがあります。このガイドでは、因果関係のある言語モデリングについて説明します。
+因果言語モデルはテキスト生成によく使用されます。これらのモデルは、次のようなクリエイティブなアプリケーションに使用できます。
+独自のテキスト アドベンチャーを選択するか、Copilot や CodeParrot などのインテリジェントなコーディング アシスタントを選択します。
+
+<Youtube id="Vpjb1lu0MDk"/>
+
+
+因果言語モデリングは、一連のトークン内の次のトークンを予測します。モデルは、次のトークンにのみ対応できます。
+左。これは、モデルが将来のトークンを認識できないことを意味します。 GPT-2 は因果的言語モデルの一例です。
+
+このガイドでは、次の方法を説明します。
+
+1. [ELI5](https:/) の [r/askscience](https://www.reddit.com/r/askscience/) サブセットで [DistilGPT2](https://huggingface.co/distilgpt2) を微調整します。 /huggingface.co/datasets/eli5) データセット。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+
+このガイドと同じ手順に従って、因果言語モデリング用に他のアーキテクチャを微調整できます。
+次のアーキテクチャのいずれかを選択します。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
+
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load ELI5 dataset
+
+
+まず、ELI5 データセットの r/askscience サブセットの小さいサブセットを 🤗 データセット ライブラリからロードします。
+ これにより、完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> eli5 = load_dataset("eli5", split="train_asks[:5000]")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train_asks` をトレイン セットとテスト セットに分割します。
+
+```py
+>>> eli5 = eli5.train_test_split(test_size=0.2)
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> eli5["train"][0]
+{'answers': {'a_id': ['c3d1aib', 'c3d4lya'],
+  'score': [6, 3],
+  'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.",
+   "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]},
+ 'answers_urls': {'url': []},
+ 'document': '',
+ 'q_id': 'nyxfp',
+ 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?',
+ 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']},
+ 'subreddit': 'askscience',
+ 'title': 'Few questions about this space walk photograph.',
+ 'title_urls': {'url': []}}
+```
+
+これは多くのことのように見えるかもしれませんが、実際に関心があるのは`text`フィールドだけです。言語モデリングの優れている点
+タスクでは、次の単語がラベル * であるため、ラベル (教師なしタスクとも呼ばれます) は必要ありません。
+
+
+## Preprocess
+
+<Youtube id="ma1TrR7gE7I"/>
+
+
+次のステップは、`text`サブフィールドを処理するために DistilGPT2 トークナイザーをロードすることです。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+```
+
+上の例からわかるように、`text`フィールドは実際には`answers`内にネストされています。つまり、次のことが必要になります。
+[` flatten`](https://huggingface.co/docs/datasets/process.html#flatten) メソッドを使用して、ネストされた構造から `text` サブフィールドを抽出します。
+
+```py
+>>> eli5 = eli5.flatten()
+>>> eli5["train"][0]
+{'answers.a_id': ['c3d1aib', 'c3d4lya'],
+ 'answers.score': [6, 3],
+ 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.",
+  "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"],
+ 'answers_urls.url': [],
+ 'document': '',
+ 'q_id': 'nyxfp',
+ 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?',
+ 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'],
+ 'subreddit': 'askscience',
+ 'title': 'Few questions about this space walk photograph.',
+ 'title_urls.url': []}
+```
+
+`answers`接頭辞で示されるように、各サブフィールドは個別の列になり、`text`フィールドはリストになりました。その代わり
+各文を個別にトークン化する場合は、リストを文字列に変換して、それらをまとめてトークン化できるようにします。
+
+以下は、各例の文字列のリストを結合し、結果をトークン化する最初の前処理関数です。
+
+```py
+>>> def preprocess_function(examples):
+...     return tokenizer([" ".join(x) for x in examples["answers.text"]])
+```
+
+この前処理関数をデータセット全体に適用するには、🤗 Datasets [`~datasets.Dataset.map`] メソッドを使用します。 `map` 関数を高速化するには、`batched=True` を設定してデータセットの複数の要素を一度に処理し、`num_proc` でプロセスの数を増やします。不要な列を削除します。
+
+```py
+>>> tokenized_eli5 = eli5.map(
+...     preprocess_function,
+...     batched=True,
+...     num_proc=4,
+...     remove_columns=eli5["train"].column_names,
+... )
+```
+
+このデータセットにはトークン シーケンスが含まれていますが、その一部はモデルの最大入力長よりも長くなります。
+
+2 番目の前処理関数を使用して、
+- すべてのシーケンスを連結します
+- 連結されたシーケンスを`block_size`で定義された短いチャンクに分割します。これは、最大入力長より短く、GPU RAM に十分な長さである必要があります。
+
+```py
+>>> block_size = 128
+
+
+>>> def group_texts(examples):
+...     # Concatenate all texts.
+...     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+...     total_length = len(concatenated_examples[list(examples.keys())[0]])
+...     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+...     # customize this part to your needs.
+...     if total_length >= block_size:
+...         total_length = (total_length // block_size) * block_size
+...     # Split by chunks of block_size.
+...     result = {
+...         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+...         for k, t in concatenated_examples.items()
+...     }
+...     result["labels"] = result["input_ids"].copy()
+...     return result
+```
+
+Apply the `group_texts` function over the entire dataset:
+
+```py
+>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)
+```
+
+次に、[`DataCollat​​orForLanguageModeling`] を使用してサンプルのバッチを作成します。 *動的にパディング*する方が効率的です。
+データセット全体を最大長までパディングするのではなく、照合中にバッチ内の文を最長の長さにします。
+
+<frameworkcontent>
+<pt>
+
+シーケンス終了トークンをパディング トークンとして使用し、`mlm=False` を設定します。これは、入力を 1 要素分右にシフトしたラベルとして使用します。
+
+```py
+>>> from transformers import DataCollatorForLanguageModeling
+
+>>> tokenizer.pad_token = tokenizer.eos_token
+>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+```
+
+</pt>
+<tf>
+シーケンス終了トークンをパディング トークンとして使用し、`mlm=False` を設定します。これは、入力を 1 要素分右にシフトしたラベルとして使用します。
+
+```py
+>>> from transformers import DataCollatorForLanguageModeling
+
+>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf")
+```
+
+</tf>
+</frameworkcontent>
+
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[基本チュートリアル](../training#train-with-pytorch-trainer) を参照してください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForCausalLM`] を使用して DistilGPT2 をロードします。
+
+
+```py
+>>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
+
+>>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。
+2. トレーニング引数をモデル、データセット、データ照合器とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_eli5_clm-model",
+...     evaluation_strategy="epoch",
+...     learning_rate=2e-5,
+...     weight_decay=0.01,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=lm_dataset["train"],
+...     eval_dataset=lm_dataset["test"],
+...     data_collator=data_collator,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.evaluate`] メソッドを使用してモデルを評価し、その複雑さを取得します。
+
+```py
+>>> import math
+
+>>> eval_results = trainer.evaluate()
+>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
+Perplexity: 49.61
+```
+
+次に、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[基本チュートリアル](../training#train-a-tensorflow-model-with-keras) をご覧ください。
+
+</Tip>
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+```py
+>>> from transformers import create_optimizer, AdamWeightDecay
+
+>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
+```
+
+次に、[`TFAutoModelForCausalLM`] を使用して DistilGPT2 をロードできます。
+
+```py
+>>> from transformers import TFAutoModelForCausalLM
+
+>>> model = TFAutoModelForCausalLM.from_pretrained("distilgpt2")
+```
+
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+...     lm_dataset["train"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_test_set = model.prepare_tf_dataset(
+...     lm_dataset["test"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+これは、モデルとトークナイザーを [`~transformers.PushToHubCallback`] でプッシュする場所を指定することで実行できます。
+
+
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> callback = PushToHubCallback(
+...     output_dir="my_awesome_eli5_clm-model",
+...     tokenizer=tokenizer,
+... )
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback])
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+因果言語モデリング用にモデルを微調整する方法のより詳細な例については、対応するドキュメントを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)。
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+テキストを生成するプロンプトを考え出します。
+
+```py
+>>> prompt = "Somatic hypermutation allows the immune system to"
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用してテキスト生成用の`pipeline`をインスタンス化し、それにテキストを渡します。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> generator = pipeline("text-generation", model="my_awesome_eli5_clm-model")
+>>> generator(prompt)
+[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}]
+```
+
+<frameworkcontent>
+<pt>
+
+
+テキストをトークン化し、「input_ids」を PyTorch テンソルとして返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model")
+>>> inputs = tokenizer(prompt, return_tensors="pt").input_ids
+```
+
+[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用してテキストを生成します。
+さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[テキスト生成戦略](../generation_strategies) ページを参照してください。
+
+```py
+>>> from transformers import AutoModelForCausalLM
+
+>>> model = AutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model")
+>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
+```
+
+生成されたトークン ID をデコードしてテキストに戻します。
+
+```py
+>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"]
+```
+
+</pt>
+<tf>
+
+テキストをトークン化し、`input_ids`を TensorFlow テンソルとして返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_eli5_clm-model")
+>>> inputs = tokenizer(prompt, return_tensors="tf").input_ids
+```
+
+[`~transformers.generation_tf_utils.TFGenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[テキスト生成戦略](../generation_strategies) ページを参照してください。
+
+```py
+>>> from transformers import TFAutoModelForCausalLM
+
+>>> model = TFAutoModelForCausalLM.from_pretrained("my_awesome_eli5_clm-model")
+>>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
+```
+
+生成されたトークン ID をデコードしてテキストに戻します。
+
+```py
+>>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
+['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for']
+```
+
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/masked_language_modeling.md b/docs/source/ja/tasks/masked_language_modeling.md
new file mode 100644
index 00000000000000..3cf6db70f2e9d6
--- /dev/null
+++ b/docs/source/ja/tasks/masked_language_modeling.md
@@ -0,0 +1,455 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Masked language modeling
+
+[[open-in-colab]]
+
+<Youtube id="mqElG5QJWUg"/>
+
+マスクされた言語モデリングはシーケンス内のマスクされたトークンを予測し、モデルはトークンを双方向に処理できます。これ
+これは、モデルが左右のトークンに完全にアクセスできることを意味します。マスクされた言語モデリングは、次のようなタスクに最適です。
+シーケンス全体の文脈をよく理解する必要があります。 BERT はマスクされた言語モデルの一例です。
+
+このガイドでは、次の方法を説明します。
+
+1. [ELI5](https://huggingface.co/distilroberta-base) の [r/askscience](https://www.reddit.com/r/askscience/) サブセットで [DistilRoBERTa](https://huggingface.co/distilroberta-base) を微調整します。 ://huggingface.co/datasets/eli5) データセット。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このガイドと同じ手順に従って、マスクされた言語モデリング用に他のアーキテクチャを微調整できます。
+次のアーキテクチャのいずれかを選択します。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ESM](../model_doc/esm), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [Perceiver](../model_doc/perceiver), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [TAPAS](../model_doc/tapas), [Wav2Vec2](../model_doc/wav2vec2), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load ELI5 dataset
+
+まず、ELI5 データセットの r/askscience サブセットの小さいサブセットを 🤗 データセット ライブラリからロードします。これで
+データセット全体のトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が与えられます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> eli5 = load_dataset("eli5", split="train_asks[:5000]")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train_asks` をトレイン セットとテスト セットに分割します。
+
+```py
+>>> eli5 = eli5.train_test_split(test_size=0.2)
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> eli5["train"][0]
+{'answers': {'a_id': ['c3d1aib', 'c3d4lya'],
+  'score': [6, 3],
+  'text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.",
+   "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"]},
+ 'answers_urls': {'url': []},
+ 'document': '',
+ 'q_id': 'nyxfp',
+ 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?',
+ 'selftext_urls': {'url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg']},
+ 'subreddit': 'askscience',
+ 'title': 'Few questions about this space walk photograph.',
+ 'title_urls': {'url': []}}
+```
+
+これは多くのことのように見えるかもしれませんが、実際に関心があるのは`text`フィールドだけです。言語モデリング タスクの優れた点は、次の単語がラベル * であるため、ラベル (教師なしタスクとも呼ばれます) が必要ないことです。
+
+## Preprocess
+
+<Youtube id="8PmhEIXhBvI"/>
+
+マスクされた言語モデリングの場合、次のステップは、`text`サブフィールドを処理するために DistilRoBERTa トークナイザーをロードすることです。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilroberta-base")
+```
+
+上の例からわかるように、`text`フィールドは実際には`answers`内にネストされています。これは、次のことを行う必要があることを意味します
+[` flatten`](https://huggingface.co/docs/datasets/process.html#flatten) メソッドを使用して、ネストされた構造から `text` サブフィールドを抽出します。
+
+```py
+>>> eli5 = eli5.flatten()
+>>> eli5["train"][0]
+{'answers.a_id': ['c3d1aib', 'c3d4lya'],
+ 'answers.score': [6, 3],
+ 'answers.text': ["The velocity needed to remain in orbit is equal to the square root of Newton's constant times the mass of earth divided by the distance from the center of the earth. I don't know the altitude of that specific mission, but they're usually around 300 km. That means he's going 7-8 km/s.\n\nIn space there are no other forces acting on either the shuttle or the guy, so they stay in the same position relative to each other. If he were to become unable to return to the ship, he would presumably run out of oxygen, or slowly fall into the atmosphere and burn up.",
+  "Hope you don't mind me asking another question, but why aren't there any stars visible in this photo?"],
+ 'answers_urls.url': [],
+ 'document': '',
+ 'q_id': 'nyxfp',
+ 'selftext': '_URL_0_\n\nThis was on the front page earlier and I have a few questions about it. Is it possible to calculate how fast the astronaut would be orbiting the earth? Also how does he stay close to the shuttle so that he can return safely, i.e is he orbiting at the same speed and can therefore stay next to it? And finally if his propulsion system failed, would he eventually re-enter the atmosphere and presumably die?',
+ 'selftext_urls.url': ['http://apod.nasa.gov/apod/image/1201/freeflyer_nasa_3000.jpg'],
+ 'subreddit': 'askscience',
+ 'title': 'Few questions about this space walk photograph.',
+ 'title_urls.url': []}
+```
+
+`answers`接頭辞で示されるように、各サブフィールドは個別の列になり、`text`フィールドはリストになりました。その代わり
+各文を個別にトークン化する場合は、リストを文字列に変換して、それらをまとめてトークン化できるようにします。
+
+以下は、各例の文字列のリストを結合し、結果をトークン化する最初の前処理関数です。
+
+```py
+>>> def preprocess_function(examples):
+...     return tokenizer([" ".join(x) for x in examples["answers.text"]])
+```
+
+この前処理関数をデータセット全体に適用するには、🤗 Datasets [`~datasets.Dataset.map`] メソッドを使用します。 `map` 関数を高速化するには、`batched=True` を設定してデータセットの複数の要素を一度に処理し、`num_proc` でプロセスの数を増やします。不要な列を削除します。
+
+```py
+>>> tokenized_eli5 = eli5.map(
+...     preprocess_function,
+...     batched=True,
+...     num_proc=4,
+...     remove_columns=eli5["train"].column_names,
+... )
+```
+
+このデータセットにはトークン シーケンスが含まれていますが、その一部はモデルの最大入力長よりも長くなります。
+
+2 番目の前処理関数を使用して、
+- すべてのシーケンスを連結します
+- 連結されたシーケンスを`block_size`で定義された短いチャンクに分割します。これは、最大入力長より短く、GPU RAM に十分な長さである必要があります。
+
+```py
+>>> block_size = 128
+
+
+>>> def group_texts(examples):
+...     # Concatenate all texts.
+...     concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+...     total_length = len(concatenated_examples[list(examples.keys())[0]])
+...     # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+...     # customize this part to your needs.
+...     if total_length >= block_size:
+...         total_length = (total_length // block_size) * block_size
+...     # Split by chunks of block_size.
+...     result = {
+...         k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
+...         for k, t in concatenated_examples.items()
+...     }
+...     return result
+```
+
+データセット全体に`group_texts`関数を適用します。
+
+```py
+>>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)
+```
+
+次に、[`DataCollat​​orForLanguageModeling`] を使用してサンプルのバッチを作成します。データセット全体を最大長までパディングするのではなく、照合中にバッチ内の最長の長さまで文を *動的にパディング* する方が効率的です。
+
+<frameworkcontent>
+<pt>
+
+シーケンス終了トークンをパディング トークンとして使用し、データを反復するたびにランダムにトークンをマスクするために `mlm_probability` を指定します。
+
+```py
+>>> from transformers import DataCollatorForLanguageModeling
+
+>>> tokenizer.pad_token = tokenizer.eos_token
+>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
+```
+</pt>
+<tf>
+
+シーケンス終了トークンをパディング トークンとして使用し、データを反復するたびにランダムにトークンをマスクするために `mlm_probability` を指定します。
+
+
+```py
+>>> from transformers import DataCollatorForLanguageModeling
+
+>>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")
+```
+</tf>
+</frameworkcontent>
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForMaskedLM`] を使用して DistilRoBERTa をロードします。
+
+```py
+>>> from transformers import AutoModelForMaskedLM
+
+>>> model = AutoModelForMaskedLM.from_pretrained("distilroberta-base")
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。
+2. トレーニング引数をモデル、データセット、データ照合器とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_eli5_mlm_model",
+...     evaluation_strategy="epoch",
+...     learning_rate=2e-5,
+...     num_train_epochs=3,
+...     weight_decay=0.01,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=lm_dataset["train"],
+...     eval_dataset=lm_dataset["test"],
+...     data_collator=data_collator,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.evaluate`] メソッドを使用してモデルを評価し、その複雑さを取得します。
+
+
+```py
+>>> import math
+
+>>> eval_results = trainer.evaluate()
+>>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
+Perplexity: 8.76
+```
+
+次に、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-a-tensorflow-model-with-keras) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+```py
+>>> from transformers import create_optimizer, AdamWeightDecay
+
+>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
+```
+
+次に、[`TFAutoModelForMaskedLM`] を使用して DistilRoBERTa をロードできます。
+
+```py
+>>> from transformers import TFAutoModelForMaskedLM
+
+>>> model = TFAutoModelForMaskedLM.from_pretrained("distilroberta-base")
+```
+
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+...     lm_dataset["train"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_test_set = model.prepare_tf_dataset(
+...     lm_dataset["test"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+This can be done by specifying where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> callback = PushToHubCallback(
+...     output_dir="my_awesome_eli5_mlm_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback])
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+マスクされた言語モデリング用にモデルを微調整する方法のより詳細な例については、対応するドキュメントを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)。
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+モデルに空白を埋めるテキストを考え出し、特別な `<mask>` トークンを使用して空白を示します。
+
+```py
+>>> text = "The Milky Way is a <mask> galaxy."
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用してフィルマスクの`pipeline`をインスタンス化し、テキストをそれに渡します。必要に応じて、`top_k`パラメータを使用して、返す予測の数を指定できます。
+
+```py
+>>> from transformers import pipeline
+
+>>> mask_filler = pipeline("fill-mask", "stevhliu/my_awesome_eli5_mlm_model")
+>>> mask_filler(text, top_k=3)
+[{'score': 0.5150994658470154,
+  'token': 21300,
+  'token_str': ' spiral',
+  'sequence': 'The Milky Way is a spiral galaxy.'},
+ {'score': 0.07087188959121704,
+  'token': 2232,
+  'token_str': ' massive',
+  'sequence': 'The Milky Way is a massive galaxy.'},
+ {'score': 0.06434620916843414,
+  'token': 650,
+  'token_str': ' small',
+  'sequence': 'The Milky Way is a small galaxy.'}]
+```
+
+<frameworkcontent>
+<pt>
+
+テキストをトークン化し、`input_ids`を PyTorch テンソルとして返します。 `<mask>` トークンの位置も指定する必要があります。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model")
+>>> inputs = tokenizer(text, return_tensors="pt")
+>>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+```
+
+入力をモデルに渡し、マスクされたトークンの`logits`を返します。
+
+```py
+>>> from transformers import AutoModelForMaskedLM
+
+>>> model = AutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model")
+>>> logits = model(**inputs).logits
+>>> mask_token_logits = logits[0, mask_token_index, :]
+```
+
+次に、マスクされた 3 つのトークンを最も高い確率で返し、出力します。
+
+```py
+>>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()
+
+>>> for token in top_3_tokens:
+...     print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
+The Milky Way is a spiral galaxy.
+The Milky Way is a massive galaxy.
+The Milky Way is a small galaxy.
+```
+
+</pt>
+<tf>
+
+テキストをトークン化し、`input_ids`を TensorFlow テンソルとして返します。 `<mask>` トークンの位置も指定する必要があります。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_eli5_mlm_model")
+>>> inputs = tokenizer(text, return_tensors="tf")
+>>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
+```
+
+入力をモデルに渡し、マスクされたトークンの`logits`を返します。
+
+
+```py
+>>> from transformers import TFAutoModelForMaskedLM
+
+>>> model = TFAutoModelForMaskedLM.from_pretrained("stevhliu/my_awesome_eli5_mlm_model")
+>>> logits = model(**inputs).logits
+>>> mask_token_logits = logits[0, mask_token_index, :]
+```
+
+次に、マスクされた 3 つのトークンを最も高い確率で返し、出力します。
+
+
+```py
+>>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy()
+
+>>> for token in top_3_tokens:
+...     print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
+The Milky Way is a spiral galaxy.
+The Milky Way is a massive galaxy.
+The Milky Way is a small galaxy.
+```
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/monocular_depth_estimation.md b/docs/source/ja/tasks/monocular_depth_estimation.md
new file mode 100644
index 00000000000000..984631fd3d5500
--- /dev/null
+++ b/docs/source/ja/tasks/monocular_depth_estimation.md
@@ -0,0 +1,154 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Monocular depth estimation
+
+単眼奥行き推定は、シーンの奥行き情報を画像から予測することを含むコンピューター ビジョン タスクです。
+単一の画像。言い換えれば、シーン内のオブジェクトの距離を距離から推定するプロセスです。
+単一カメラの視点。
+
+単眼奥行き推定には、3D 再構築、拡張現実、自動運転、
+そしてロボット工学。モデルがオブジェクト間の複雑な関係を理解する必要があるため、これは困難な作業です。
+シーンとそれに対応する深度情報（照明条件などの要因の影響を受ける可能性があります）
+オクルージョンとテクスチャ。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[DPT](../model_doc/dpt), [GLPN](../model_doc/glpn)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+このガイドでは、次の方法を学びます。
+
+* 深度推定パイプラインを作成する
+* 手動で深度推定推論を実行します
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q transformers
+```
+
+## Depth estimation pipeline
+
+深度推定をサポートするモデルで推論を試す最も簡単な方法は、対応する [`pipeline`] を使用することです。
+[Hugging Face Hub のチェックポイント](https://huggingface.co/models?pipeline_tag=Depth-estimation&sort=downloads) からパイプラインをインスタンス化します。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> checkpoint = "vinvino02/glpn-nyu"
+>>> depth_estimator = pipeline("depth-estimation", model=checkpoint)
+```
+
+次に、分析する画像を選択します。
+
+```py
+>>> from PIL import Image
+>>> import requests
+
+>>> url = "https://unsplash.com/photos/HwBAsSbPBDU/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MzR8fGNhciUyMGluJTIwdGhlJTIwc3RyZWV0fGVufDB8MHx8fDE2Nzg5MDEwODg&force=true&w=640"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+>>> image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/depth-estimation-example.jpg" alt="Photo of a busy street"/>
+</div>
+
+画像をパイプラインに渡します。
+
+```py
+>>> predictions = depth_estimator(image)
+```
+
+パイプラインは 2 つのエントリを含む辞書を返します。最初のものは`predicted_ Depth`と呼ばれ、次の値を持つテンソルです。
+深さは各ピクセルのメートル単位で表されます。
+2 番目の`depth`は、深度推定結果を視覚化する PIL 画像です。
+
+視覚化された結果を見てみましょう。
+
+```py
+>>> predictions["depth"]
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/depth-visualization.png" alt="Depth estimation visualization"/>
+</div>
+
+## Depth estimation inference by hand
+
+深度推定パイプラインの使用方法を理解したので、同じ結果を手動で複製する方法を見てみましょう。
+
+まず、[Hugging Face Hub のチェックポイント](https://huggingface.co/models?pipeline_tag=Depth-estimation&sort=downloads) からモデルと関連プロセッサをロードします。
+ここでは、前と同じチェックポイントを使用します。
+
+
+```py
+>>> from transformers import AutoImageProcessor, AutoModelForDepthEstimation
+
+>>> checkpoint = "vinvino02/glpn-nyu"
+
+>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint)
+>>> model = AutoModelForDepthEstimation.from_pretrained(checkpoint)
+```
+
+必要な画像変換を処理する`image_processor`を使用して、モデルの画像入力を準備します。
+サイズ変更や正規化など:
+
+```py
+>>> pixel_values = image_processor(image, return_tensors="pt").pixel_values
+```
+
+準備された入力をモデルに渡します。
+
+```py
+>>> import torch
+
+>>> with torch.no_grad():
+...     outputs = model(pixel_values)
+...     predicted_depth = outputs.predicted_depth
+```
+
+結果を視覚化します。
+
+
+```py
+>>> import numpy as np
+
+>>> # interpolate to original size
+>>> prediction = torch.nn.functional.interpolate(
+...     predicted_depth.unsqueeze(1),
+...     size=image.size[::-1],
+...     mode="bicubic",
+...     align_corners=False,
+... ).squeeze()
+>>> output = prediction.numpy()
+
+>>> formatted = (output * 255 / np.max(output)).astype("uint8")
+>>> depth = Image.fromarray(formatted)
+>>> depth
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/depth-visualization.png" alt="Depth estimation visualization"/>
+</div>
diff --git a/docs/source/ja/tasks/multiple_choice.md b/docs/source/ja/tasks/multiple_choice.md
new file mode 100644
index 00000000000000..6b634710550be6
--- /dev/null
+++ b/docs/source/ja/tasks/multiple_choice.md
@@ -0,0 +1,470 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Multiple choice
+
+[[open-in-colab]]
+
+多肢選択タスクは質問応答に似ていますが、いくつかの候補の回答がコンテキストとともに提供され、正しい回答を選択するようにモデルがトレーニングされる点が異なります。
+
+このガイドでは、次の方法を説明します。
+
+1. [SWAG](https://huggingface.co/datasets/swag) データセットの「通常」構成で [BERT](https://huggingface.co/bert-base-uncased) を微調整して、最適なデータセットを選択します複数の選択肢と何らかのコンテキストを考慮して回答します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [I-BERT](../model_doc/ibert), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load SWAG dataset
+
+まず、🤗 データセット ライブラリから SWAG データセットの「通常」構成をロードします。
+
+```py
+>>> from datasets import load_dataset
+
+>>> swag = load_dataset("swag", "regular")
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> swag["train"][0]
+{'ending0': 'passes by walking down the street playing their instruments.',
+ 'ending1': 'has heard approaching them.',
+ 'ending2': "arrives and they're outside dancing and asleep.",
+ 'ending3': 'turns the lead singer watches the performance.',
+ 'fold-ind': '3416',
+ 'gold-source': 'gold',
+ 'label': 0,
+ 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
+ 'sent2': 'A drum line',
+ 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
+ 'video-id': 'anetv_jkn6uvmqwh4'}
+```
+
+ここにはたくさんのフィールドがあるように見えますが、実際は非常に簡単です。
+
+- `sent1` と `sent2`: これらのフィールドは文の始まりを示し、この 2 つを組み合わせると `startphrase` フィールドが得られます。
+- `ending`: 文の終わり方として考えられる終わり方を示唆しますが、正しいのは 1 つだけです。
+- `label`: 正しい文の終わりを識別します。
+
+## Preprocess
+
+次のステップでは、BERT トークナイザーをロードして、文の始まりと 4 つの可能な終わりを処理します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+```
+
+作成する前処理関数は次のことを行う必要があります。
+
+1. `sent1` フィールドのコピーを 4 つ作成し、それぞれを `sent2` と組み合わせて文の始まりを再現します。
+2. `sent2` を 4 つの可能な文末尾のそれぞれと組み合わせます。
+3. これら 2 つのリストをトークン化できるようにフラット化し、その後、各例に対応する `input_ids`、`attention_mask`、および `labels` フィールドが含まれるように非フラット化します。
+
+```py
+>>> ending_names = ["ending0", "ending1", "ending2", "ending3"]
+
+
+>>> def preprocess_function(examples):
+...     first_sentences = [[context] * 4 for context in examples["sent1"]]
+...     question_headers = examples["sent2"]
+...     second_sentences = [
+...         [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
+...     ]
+
+...     first_sentences = sum(first_sentences, [])
+...     second_sentences = sum(second_sentences, [])
+
+...     tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
+...     return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] メソッドを使用します。 `batched=True` を設定してデータセットの複数の要素を一度に処理することで、`map` 関数を高速化できます。
+
+
+```py
+tokenized_swag = swag.map(preprocess_function, batched=True)
+```
+
+🤗 Transformers には多肢選択用のデータ照合器がないため、[`DataCollat​​orWithPadding`] を調整してサンプルのバッチを作成する必要があります。データセット全体を最大長までパディングするのではなく、照合中にバッチ内の最長の長さまで文を *動的にパディング* する方が効率的です。
+
+`DataCollat​​orForMultipleChoice` は、すべてのモデル入力を平坦化し、パディングを適用して、結果を非平坦化します。
+
+<frameworkcontent>
+<pt>
+```py
+>>> from dataclasses import dataclass
+>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
+>>> from typing import Optional, Union
+>>> import torch
+
+
+>>> @dataclass
+... class DataCollatorForMultipleChoice:
+...     """
+...     Data collator that will dynamically pad the inputs for multiple choice received.
+...     """
+
+...     tokenizer: PreTrainedTokenizerBase
+...     padding: Union[bool, str, PaddingStrategy] = True
+...     max_length: Optional[int] = None
+...     pad_to_multiple_of: Optional[int] = None
+
+...     def __call__(self, features):
+...         label_name = "label" if "label" in features[0].keys() else "labels"
+...         labels = [feature.pop(label_name) for feature in features]
+...         batch_size = len(features)
+...         num_choices = len(features[0]["input_ids"])
+...         flattened_features = [
+...             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
+...         ]
+...         flattened_features = sum(flattened_features, [])
+
+...         batch = self.tokenizer.pad(
+...             flattened_features,
+...             padding=self.padding,
+...             max_length=self.max_length,
+...             pad_to_multiple_of=self.pad_to_multiple_of,
+...             return_tensors="pt",
+...         )
+
+...         batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
+...         batch["labels"] = torch.tensor(labels, dtype=torch.int64)
+...         return batch
+```
+</pt>
+<tf>
+```py
+>>> from dataclasses import dataclass
+>>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
+>>> from typing import Optional, Union
+>>> import tensorflow as tf
+
+
+>>> @dataclass
+... class DataCollatorForMultipleChoice:
+...     """
+...     Data collator that will dynamically pad the inputs for multiple choice received.
+...     """
+
+...     tokenizer: PreTrainedTokenizerBase
+...     padding: Union[bool, str, PaddingStrategy] = True
+...     max_length: Optional[int] = None
+...     pad_to_multiple_of: Optional[int] = None
+
+...     def __call__(self, features):
+...         label_name = "label" if "label" in features[0].keys() else "labels"
+...         labels = [feature.pop(label_name) for feature in features]
+...         batch_size = len(features)
+...         num_choices = len(features[0]["input_ids"])
+...         flattened_features = [
+...             [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
+...         ]
+...         flattened_features = sum(flattened_features, [])
+
+...         batch = self.tokenizer.pad(
+...             flattened_features,
+...             padding=self.padding,
+...             max_length=self.max_length,
+...             pad_to_multiple_of=self.pad_to_multiple_of,
+...             return_tensors="tf",
+...         )
+
+...         batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
+...         batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
+...         return batch
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[accuracy](https://huggingface.co/spaces/evaluate-metric/accuracy) メトリクスを読み込みます (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照してください) ) メトリクスの読み込みと計算方法の詳細については、次を参照してください)。
+
+```py
+>>> import evaluate
+
+>>> accuracy = evaluate.load("accuracy")
+```
+
+次に、予測とラベルを [`~evaluate.EvaluationModule.compute`] に渡して精度を計算する関数を作成します。
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions, labels = eval_pred
+...     predictions = np.argmax(predictions, axis=1)
+...     return accuracy.compute(predictions=predictions, references=labels)
+```
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForMultipleChoice`] を使用して BERT をロードします。
+
+```py
+>>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
+
+>>> model = AutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は精度を評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_swag_model",
+...     evaluation_strategy="epoch",
+...     save_strategy="epoch",
+...     load_best_model_at_end=True,
+...     learning_rate=5e-5,
+...     per_device_train_batch_size=16,
+...     per_device_eval_batch_size=16,
+...     num_train_epochs=3,
+...     weight_decay=0.01,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=tokenized_swag["train"],
+...     eval_dataset=tokenized_swag["validation"],
+...     tokenizer=tokenizer,
+...     data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できますように。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-a-tensorflow-model-with-keras) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 16
+>>> num_train_epochs = 2
+>>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs
+>>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
+```
+
+次に、[`TFAutoModelForMultipleChoice`] を使用して BERT をロードできます。
+
+```py
+>>> from transformers import TFAutoModelForMultipleChoice
+
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("bert-base-uncased")
+```
+
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
+>>> tf_train_set = model.prepare_tf_dataset(
+...     tokenized_swag["train"],
+...     shuffle=True,
+...     batch_size=batch_size,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_validation_set = model.prepare_tf_dataset(
+...     tokenized_swag["validation"],
+...     shuffle=False,
+...     batch_size=batch_size,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+トレーニングを開始する前にセットアップする最後の 2 つのことは、予測から精度を計算することと、モデルをハブにプッシュする方法を提供することです。どちらも [Keras コールバック](../main_classes/keras_callbacks) を使用して行われます。
+
+`compute_metrics` 関数を [`~transformers.KerasMetricCallback`] に渡します。
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
+```
+
+[`~transformers.PushToHubCallback`] でモデルとトークナイザーをプッシュする場所を指定します。
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="my_awesome_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+次に、コールバックをまとめてバンドルします。
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks)
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+
+</tf>
+</frameworkcontent>
+
+
+<Tip>
+
+複数選択用にモデルを微調整する方法の詳細な例については、対応するセクションを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)。
+
+</Tip>
+
+
+# Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+いくつかのテキストと 2 つの回答候補を考えてください。
+
+```py
+>>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
+>>> candidate1 = "The law does not apply to croissants and brioche."
+>>> candidate2 = "The law applies to baguettes."
+```
+
+<frameworkcontent>
+<pt>
+
+各プロンプトと回答候補のペアをトークン化し、PyTorch テンソルを返します。いくつかの`lables`も作成する必要があります。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
+>>> labels = torch.tensor(0).unsqueeze(0)
+```
+
+入力とラベルをモデルに渡し、`logits`を返します。
+
+```py
+>>> from transformers import AutoModelForMultipleChoice
+
+>>> model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
+>>> logits = outputs.logits
+```
+
+最も高い確率でクラスを取得します。
+
+```py
+>>> predicted_class = logits.argmax().item()
+>>> predicted_class
+'0'
+```
+</pt>
+<tf>
+
+各プロンプトと回答候補のペアをトークン化し、TensorFlow テンソルを返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model")
+>>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+```py
+>>> from transformers import TFAutoModelForMultipleChoice
+
+>>> model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model")
+>>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
+>>> outputs = model(inputs)
+>>> logits = outputs.logits
+```
+
+最も高い確率でクラスを取得します。
+
+```py
+>>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
+>>> predicted_class
+'0'
+```
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md
new file mode 100644
index 00000000000000..389e7bdf2f455e
--- /dev/null
+++ b/docs/source/ja/tasks/object_detection.md
@@ -0,0 +1,603 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Object detection
+
+[[open-in-colab]]
+
+オブジェクト検出は、画像内のインスタンス (人間、建物、車など) を検出するコンピューター ビジョン タスクです。物体検出モデルは画像を入力および出力として受け取ります
+検出されたオブジェクトの境界ボックスと関連するラベルの座標。画像には複数のオブジェクトを含めることができます。
+それぞれに独自の境界ボックスとラベルがあり (例: 車と建物を持つことができます)、各オブジェクトは
+画像のさまざまな部分に存在する必要があります (たとえば、画像には複数の車が含まれている可能性があります)。
+このタスクは、歩行者、道路標識、信号機などを検出するために自動運転で一般的に使用されます。
+他のアプリケーションには、画像内のオブジェクトのカウント、画像検索などが含まれます。
+
+このガイドでは、次の方法を学習します。
+
+ 1. Finetune [DETR](https://huggingface.co/docs/transformers/model_doc/detr)、畳み込みアルゴリズムを組み合わせたモデル
+ [CPPE-5](https://huggingface.co/datasets/cppe-5) 上のエンコーダー/デコーダー トランスフォーマーを備えたバックボーン
+ データセット。
+ 2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[Conditional DETR](../model_doc/conditional_detr), [Deformable DETR](../model_doc/deformable_detr), [DETA](../model_doc/deta), [DETR](../model_doc/detr), [Table Transformer](../model_doc/table-transformer), [YOLOS](../model_doc/yolos)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+
+```bash
+pip install -q datasets transformers evaluate timm albumentations
+```
+
+🤗 データセットを使用して Hugging Face Hub からデータセットをロードし、🤗 トランスフォーマーを使用してモデルをトレーニングします。
+データを増強するための`albumentations`。 `timm` は現在、DETR モデルの畳み込みバックボーンをロードするために必要です。
+
+モデルをコミュニティと共有することをお勧めします。 Hugging Face アカウントにログインして、ハブにアップロードします。
+プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load the CPPE-5 dataset
+
+[CPPE-5 データセット](https://huggingface.co/datasets/cppe-5) には、次の画像が含まれています。
+新型コロナウイルス感染症のパンデミックにおける医療用個人保護具 (PPE) を識別する注釈。
+
+データセットをロードすることから始めます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> cppe5 = load_dataset("cppe-5")
+>>> cppe5
+DatasetDict({
+    train: Dataset({
+        features: ['image_id', 'image', 'width', 'height', 'objects'],
+        num_rows: 1000
+    })
+    test: Dataset({
+        features: ['image_id', 'image', 'width', 'height', 'objects'],
+        num_rows: 29
+    })
+})
+```
+
+このデータセットには、1000 枚の画像を含むトレーニング セットと 29 枚の画像を含むテスト セットがすでに付属していることがわかります。
+
+データに慣れるために、例がどのようなものかを調べてください。
+
+```py
+>>> cppe5["train"][0]
+{'image_id': 15,
+ 'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=943x663 at 0x7F9EC9E77C10>,
+ 'width': 943,
+ 'height': 663,
+ 'objects': {'id': [114, 115, 116, 117],
+  'area': [3796, 1596, 152768, 81002],
+  'bbox': [[302.0, 109.0, 73.0, 52.0],
+   [810.0, 100.0, 57.0, 28.0],
+   [160.0, 31.0, 248.0, 616.0],
+   [741.0, 68.0, 202.0, 401.0]],
+  'category': [4, 4, 0, 0]}}
+```
+
+データセット内の例には次のフィールドがあります。
+- `image_id`: サンプルの画像ID
+- `image`: 画像を含む `PIL.Image.Image` オブジェクト
+- `width`: 画像の幅
+- `height`: 画像の高さ
+- `objects`: 画像内のオブジェクトの境界ボックスのメタデータを含む辞書:
+  - `id`: アノテーションID
+  - `area`: 境界ボックスの領域
+  - `bbox`: オブジェクトの境界ボックス ([COCO 形式](https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/#coco) )
+  - `category`: オブジェクトのカテゴリー。可能な値には、`Coverall (0)`、`Face_Shield (1)`、`Gloves (2)`、`Goggles (3)`、および `Mask (4)` が含まれます。
+
+`bbox`フィールドが COCO 形式に従っていることに気づくかもしれません。これは DETR モデルが予期する形式です。
+ただし、「オブジェクト」内のフィールドのグループ化は、DETR が必要とする注釈形式とは異なります。あなたはするであろう
+このデータをトレーニングに使用する前に、いくつかの前処理変換を適用する必要があります。
+
+データをさらに深く理解するには、データセット内の例を視覚化します。
+
+```py
+>>> import numpy as np
+>>> import os
+>>> from PIL import Image, ImageDraw
+
+>>> image = cppe5["train"][0]["image"]
+>>> annotations = cppe5["train"][0]["objects"]
+>>> draw = ImageDraw.Draw(image)
+
+>>> categories = cppe5["train"].features["objects"].feature["category"].names
+
+>>> id2label = {index: x for index, x in enumerate(categories, start=0)}
+>>> label2id = {v: k for k, v in id2label.items()}
+
+>>> for i in range(len(annotations["id"])):
+...     box = annotations["bbox"][i]
+...     class_idx = annotations["category"][i]
+...     x, y, w, h = tuple(box)
+...     draw.rectangle((x, y, x + w, y + h), outline="red", width=1)
+...     draw.text((x, y), id2label[class_idx], fill="white")
+
+>>> image
+```
+
+<div class="flex justify-center">
+    <img src="https://i.imgur.com/TdaqPJO.png" alt="CPPE-5 Image Example"/>
+</div>
+
+関連付けられたラベルを使用して境界ボックスを視覚化するには、データセットのメタデータからラベルを取得します。
+`category`フィールド。
+また、ラベル ID をラベル クラスにマッピングする辞書 (`id2label`) やその逆 (`label2id`) を作成することもできます。
+これらは、後でモデルをセットアップするときに使用できます。これらのマップを含めると、共有した場合に他の人がモデルを再利用できるようになります。
+ハグフェイスハブに取り付けます。
+
+データに慣れるための最後のステップとして、潜在的な問題がないかデータを調査します。データセットに関する一般的な問題の 1 つは、
+オブジェクト検出は、画像の端を越えて「伸びる」境界ボックスです。このような「暴走」境界ボックスは、
+トレーニング中にエラーが発生するため、この段階で対処する必要があります。このデータセットには、この問題に関する例がいくつかあります。
+このガイドでは内容をわかりやすくするために、これらの画像をデータから削除します。
+
+```py
+>>> remove_idx = [590, 821, 822, 875, 876, 878, 879]
+>>> keep = [i for i in range(len(cppe5["train"])) if i not in remove_idx]
+>>> cppe5["train"] = cppe5["train"].select(keep)
+```
+
+## Preprocess the data
+
+モデルを微調整するには、事前トレーニングされたモデルに使用されるアプローチと正確に一致するように、使用する予定のデータを前処理する必要があります。
+[`AutoImageProcessor`] は、画像データを処理して `pixel_values`、`pixel_mask`、および
+DETR モデルをトレーニングできる「ラベル」。画像プロセッサには、心配する必要のないいくつかの属性があります。
+
+- `image_mean = [0.485, 0.456, 0.406 ]`
+- `image_std = [0.229, 0.224, 0.225]`
+
+これらは、モデルの事前トレーニング中に画像を正規化するために使用される平均と標準偏差です。これらの価値観は非常に重要です
+事前にトレーニングされた画像モデルを推論または微調整するときに複製します。
+
+微調整するモデルと同じチェックポイントからイメージ プロセッサをインスタンス化します。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> checkpoint = "facebook/detr-resnet-50"
+>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint)
+```
+
+画像を`image_processor`に渡す前に、2 つの前処理変換をデータセットに適用します。
+- 画像の拡張
+- DETR の期待に応えるための注釈の再フォーマット
+
+まず、モデルがトレーニング データにオーバーフィットしないようにするために、任意のデータ拡張ライブラリを使用して画像拡張を適用できます。ここでは[Albumentations](https://albumentations.ai/docs/)を使用します...
+このライブラリは、変換が画像に影響を与え、それに応じて境界ボックスを更新することを保証します。
+🤗 データセット ライブラリのドキュメントには、詳細な [物体検出用に画像を拡張する方法に関するガイド](https://huggingface.co/docs/datasets/object_detection) が記載されています。
+例としてまったく同じデータセットを使用しています。ここでも同じアプローチを適用し、各画像のサイズを (480, 480) に変更します。
+水平に反転して明るくします。
+
+```py
+>>> import albumentations
+>>> import numpy as np
+>>> import torch
+
+>>> transform = albumentations.Compose(
+...     [
+...         albumentations.Resize(480, 480),
+...         albumentations.HorizontalFlip(p=1.0),
+...         albumentations.RandomBrightnessContrast(p=1.0),
+...     ],
+...     bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]),
+... )
+```
+
+`image_processor` は、注釈が次の形式であることを期待します: `{'image_id': int, 'annotations': List[Dict]}`,
+ ここで、各辞書は COCO オブジェクトの注釈です。 1 つの例として、注釈を再フォーマットする関数を追加してみましょう。
+
+ ```py
+>>> def formatted_anns(image_id, category, area, bbox):
+...     annotations = []
+...     for i in range(0, len(category)):
+...         new_ann = {
+...             "image_id": image_id,
+...             "category_id": category[i],
+...             "isCrowd": 0,
+...             "area": area[i],
+...             "bbox": list(bbox[i]),
+...         }
+...         annotations.append(new_ann)
+
+...     return annotations
+```
+
+これで、画像と注釈の変換を組み合わせてサンプルのバッチで使用できるようになりました。
+
+```py
+>>> # transforming a batch
+>>> def transform_aug_ann(examples):
+...     image_ids = examples["image_id"]
+...     images, bboxes, area, categories = [], [], [], []
+...     for image, objects in zip(examples["image"], examples["objects"]):
+...         image = np.array(image.convert("RGB"))[:, :, ::-1]
+...         out = transform(image=image, bboxes=objects["bbox"], category=objects["category"])
+
+...         area.append(objects["area"])
+...         images.append(out["image"])
+...         bboxes.append(out["bboxes"])
+...         categories.append(out["category"])
+
+...     targets = [
+...         {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)}
+...         for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes)
+...     ]
+
+...     return image_processor(images=images, annotations=targets, return_tensors="pt")
+```
+
+🤗 Datasets [`~datasets.Dataset.with_transform`] メソッドを使用して、この前処理関数をデータセット全体に適用します。この方法が適用されるのは、
+データセットの要素を読み込むときに、その場で変換します。
+
+この時点で、データセットの例が変換後にどのようになるかを確認できます。テンソルが表示されるはずです
+`pixel_values`、テンソルと `pixel_mask`、および `labels` を使用します。
+
+```py
+>>> cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann)
+>>> cppe5["train"][15]
+{'pixel_values': tensor([[[ 0.9132,  0.9132,  0.9132,  ..., -1.9809, -1.9809, -1.9809],
+          [ 0.9132,  0.9132,  0.9132,  ..., -1.9809, -1.9809, -1.9809],
+          [ 0.9132,  0.9132,  0.9132,  ..., -1.9638, -1.9638, -1.9638],
+          ...,
+          [-1.5699, -1.5699, -1.5699,  ..., -1.9980, -1.9980, -1.9980],
+          [-1.5528, -1.5528, -1.5528,  ..., -1.9980, -1.9809, -1.9809],
+          [-1.5528, -1.5528, -1.5528,  ..., -1.9980, -1.9809, -1.9809]],
+
+         [[ 1.3081,  1.3081,  1.3081,  ..., -1.8431, -1.8431, -1.8431],
+          [ 1.3081,  1.3081,  1.3081,  ..., -1.8431, -1.8431, -1.8431],
+          [ 1.3081,  1.3081,  1.3081,  ..., -1.8256, -1.8256, -1.8256],
+          ...,
+          [-1.3179, -1.3179, -1.3179,  ..., -1.8606, -1.8606, -1.8606],
+          [-1.3004, -1.3004, -1.3004,  ..., -1.8606, -1.8431, -1.8431],
+          [-1.3004, -1.3004, -1.3004,  ..., -1.8606, -1.8431, -1.8431]],
+
+         [[ 1.4200,  1.4200,  1.4200,  ..., -1.6476, -1.6476, -1.6476],
+          [ 1.4200,  1.4200,  1.4200,  ..., -1.6476, -1.6476, -1.6476],
+          [ 1.4200,  1.4200,  1.4200,  ..., -1.6302, -1.6302, -1.6302],
+          ...,
+          [-1.0201, -1.0201, -1.0201,  ..., -1.5604, -1.5604, -1.5604],
+          [-1.0027, -1.0027, -1.0027,  ..., -1.5604, -1.5430, -1.5430],
+          [-1.0027, -1.0027, -1.0027,  ..., -1.5604, -1.5430, -1.5430]]]),
+ 'pixel_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
+         [1, 1, 1,  ..., 1, 1, 1],
+         [1, 1, 1,  ..., 1, 1, 1],
+         ...,
+         [1, 1, 1,  ..., 1, 1, 1],
+         [1, 1, 1,  ..., 1, 1, 1],
+         [1, 1, 1,  ..., 1, 1, 1]]),
+ 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}}
+```
+
+個々の画像を正常に拡張し、それらの注釈を準備しました。ただし、前処理はそうではありません。
+まだ完成しています。最後のステップでは、画像をバッチ処理するためのカスタム `collat​​e_fn` を作成します。
+画像 (現在は `pixel_values`) をバッチ内の最大の画像にパディングし、対応する `pixel_mask` を作成します
+どのピクセルが実数 (1) で、どのピクセルがパディング (0) であるかを示します。
+
+
+```py
+>>> def collate_fn(batch):
+...     pixel_values = [item["pixel_values"] for item in batch]
+...     encoding = image_processor.pad(pixel_values, return_tensors="pt")
+...     labels = [item["labels"] for item in batch]
+...     batch = {}
+...     batch["pixel_values"] = encoding["pixel_values"]
+...     batch["pixel_mask"] = encoding["pixel_mask"]
+...     batch["labels"] = labels
+...     return batch
+```
+
+## Training the DETR model
+
+前のセクションで重労働のほとんどを完了したので、モデルをトレーニングする準備が整いました。
+このデータセット内の画像は、サイズを変更した後でも依然として非常に大きいです。これは、このモデルを微調整すると、
+少なくとも 1 つの GPU が必要です。
+
+トレーニングには次の手順が含まれます。
+1. 前処理と同じチェックポイントを使用して、[`AutoModelForObjectDetection`] でモデルを読み込みます。
+2. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。
+3. トレーニング引数をモデル、データセット、画像プロセッサ、データ照合器とともに [`Trainer`] に渡します。
+4. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+前処理に使用したのと同じチェックポイントからモデルをロードするときは、必ず`label2id`を渡してください。
+および `id2label` マップは、以前にデータセットのメタデータから作成したものです。さらに、`ignore_mismatched_sizes=True`を指定して、既存の分類頭部を新しい分類頭部に置き換えます。
+
+```py
+>>> from transformers import AutoModelForObjectDetection
+
+>>> model = AutoModelForObjectDetection.from_pretrained(
+...     checkpoint,
+...     id2label=id2label,
+...     label2id=label2id,
+...     ignore_mismatched_sizes=True,
+... )
+```
+
+[`TrainingArguments`] で、`output_dir` を使用してモデルの保存場所を指定し、必要に応じてハイパーパラメーターを構成します。
+画像列が削除されるため、未使用の列を削除しないことが重要です。画像列がないと、
+`pixel_values` を作成できません。このため、`remove_unused_columns`を`False`に設定します。
+ハブにプッシュしてモデルを共有したい場合は、`push_to_hub` を `True` に設定します (Hugging にサインインする必要があります)
+顔に向かってモデルをアップロードします）。
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> training_args = TrainingArguments(
+...     output_dir="detr-resnet-50_finetuned_cppe5",
+...     per_device_train_batch_size=8,
+...     num_train_epochs=10,
+...     fp16=True,
+...     save_steps=200,
+...     logging_steps=50,
+...     learning_rate=1e-5,
+...     weight_decay=1e-4,
+...     save_total_limit=2,
+...     remove_unused_columns=False,
+...     push_to_hub=True,
+... )
+```
+
+最後に、すべてをまとめて、[`~transformers.Trainer.train`] を呼び出します。
+
+```py
+>>> from transformers import Trainer
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     data_collator=collate_fn,
+...     train_dataset=cppe5["train"],
+...     tokenizer=image_processor,
+... )
+
+>>> trainer.train()
+```
+
+`training_args`で`push_to_hub`を`True`に設定した場合、トレーニング チェックポイントは
+ハグフェイスハブ。トレーニングが完了したら、[`~transformers.Trainer.push_to_hub`] メソッドを呼び出して、最終モデルもハブにプッシュします。
+
+```py
+>>> trainer.push_to_hub()
+```
+
+## Evaluate
+
+物体検出モデルは通常、一連の <a href="https://cocodataset.org/#detection-eval">COCO スタイルの指標</a>を使用して評価されます。
+既存のメトリクス実装のいずれかを使用できますが、ここでは`torchvision`のメトリクス実装を使用して最終的なメトリクスを評価します。
+ハブにプッシュしたモデル。
+
+`torchvision`エバリュエーターを使用するには、グラウンド トゥルース COCO データセットを準備する必要があります。 COCO データセットを構築するための API
+データを特定の形式で保存する必要があるため、最初に画像と注釈をディスクに保存する必要があります。と同じように
+トレーニング用にデータを準備するとき、`cppe5["test"]` からの注釈をフォーマットする必要があります。ただし、画像
+そのままでいるべきです。
+
+評価ステップには少し作業が必要ですが、大きく 3 つのステップに分けることができます。
+まず、`cppe5["test"]` セットを準備します。注釈をフォーマットし、データをディスクに保存します。
+
+
+```py
+>>> import json
+
+
+>>> # format annotations the same as for training, no need for data augmentation
+>>> def val_formatted_anns(image_id, objects):
+...     annotations = []
+...     for i in range(0, len(objects["id"])):
+...         new_ann = {
+...             "id": objects["id"][i],
+...             "category_id": objects["category"][i],
+...             "iscrowd": 0,
+...             "image_id": image_id,
+...             "area": objects["area"][i],
+...             "bbox": objects["bbox"][i],
+...         }
+...         annotations.append(new_ann)
+
+...     return annotations
+
+
+>>> # Save images and annotations into the files torchvision.datasets.CocoDetection expects
+>>> def save_cppe5_annotation_file_images(cppe5):
+...     output_json = {}
+...     path_output_cppe5 = f"{os.getcwd()}/cppe5/"
+
+...     if not os.path.exists(path_output_cppe5):
+...         os.makedirs(path_output_cppe5)
+
+...     path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json")
+...     categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label]
+...     output_json["images"] = []
+...     output_json["annotations"] = []
+...     for example in cppe5:
+...         ann = val_formatted_anns(example["image_id"], example["objects"])
+...         output_json["images"].append(
+...             {
+...                 "id": example["image_id"],
+...                 "width": example["image"].width,
+...                 "height": example["image"].height,
+...                 "file_name": f"{example['image_id']}.png",
+...             }
+...         )
+...         output_json["annotations"].extend(ann)
+...     output_json["categories"] = categories_json
+
+...     with open(path_anno, "w") as file:
+...         json.dump(output_json, file, ensure_ascii=False, indent=4)
+
+...     for im, img_id in zip(cppe5["image"], cppe5["image_id"]):
+...         path_img = os.path.join(path_output_cppe5, f"{img_id}.png")
+...         im.save(path_img)
+
+...     return path_output_cppe5, path_anno
+```
+
+次に、`cocoevaluator`で利用できる`CocoDetection`クラスのインスタンスを用意します。
+
+
+```py
+>>> import torchvision
+
+
+>>> class CocoDetection(torchvision.datasets.CocoDetection):
+...     def __init__(self, img_folder, image_processor, ann_file):
+...         super().__init__(img_folder, ann_file)
+...         self.image_processor = image_processor
+
+...     def __getitem__(self, idx):
+...         # read in PIL image and target in COCO format
+...         img, target = super(CocoDetection, self).__getitem__(idx)
+
+...         # preprocess image and target: converting target to DETR format,
+...         # resizing + normalization of both image and target)
+...         image_id = self.ids[idx]
+...         target = {"image_id": image_id, "annotations": target}
+...         encoding = self.image_processor(images=img, annotations=target, return_tensors="pt")
+...         pixel_values = encoding["pixel_values"].squeeze()  # remove batch dimension
+...         target = encoding["labels"][0]  # remove batch dimension
+
+...         return {"pixel_values": pixel_values, "labels": target}
+
+
+>>> im_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
+
+>>> path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"])
+>>> test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno)
+```
+
+最後に、メトリクスをロードして評価を実行します。
+
+```py
+>>> import evaluate
+>>> from tqdm import tqdm
+
+>>> model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
+>>> module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco)
+>>> val_dataloader = torch.utils.data.DataLoader(
+...     test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn
+... )
+
+>>> with torch.no_grad():
+...     for idx, batch in enumerate(tqdm(val_dataloader)):
+...         pixel_values = batch["pixel_values"]
+...         pixel_mask = batch["pixel_mask"]
+
+...         labels = [
+...             {k: v for k, v in t.items()} for t in batch["labels"]
+...         ]  # these are in DETR format, resized + normalized
+
+...         # forward pass
+...         outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
+
+...         orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
+...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
+
+...         module.add(prediction=results, reference=labels)
+...         del batch
+
+>>> results = module.compute()
+>>> print(results)
+Accumulating evaluation results...
+DONE (t=0.08s).
+IoU metric: bbox
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.352
+ Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.681
+ Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.292
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.168
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.208
+ Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.429
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.274
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.484
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.501
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.323
+ Average Recall     (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.590
+```
+
+これらの結果は、[`~transformers.TrainingArguments`] のハイパーパラメータを調整することでさらに改善できます。試してごらん！
+
+## Inference
+
+DETR モデルを微調整して評価し、Hugging Face Hub にアップロードしたので、それを推論に使用できます。
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。パイプラインをインスタンス化する
+モデルを使用してオブジェクトを検出し、それに画像を渡します。
+
+
+```py
+>>> from transformers import pipeline
+>>> import requests
+
+>>> url = "https://i.imgur.com/2lnWoly.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> obj_detector = pipeline("object-detection", model="devonho/detr-resnet-50_finetuned_cppe5")
+>>> obj_detector(image)
+```
+
+必要に応じて、パイプラインの結果を手動で複製することもできます。
+
+```py
+>>> image_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
+>>> model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5")
+
+>>> with torch.no_grad():
+...     inputs = image_processor(images=image, return_tensors="pt")
+...     outputs = model(**inputs)
+...     target_sizes = torch.tensor([image.size[::-1]])
+...     results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
+
+>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+...     box = [round(i, 2) for i in box.tolist()]
+...     print(
+...         f"Detected {model.config.id2label[label.item()]} with confidence "
+...         f"{round(score.item(), 3)} at location {box}"
+...     )
+Detected Coverall with confidence 0.566 at location [1215.32, 147.38, 4401.81, 3227.08]
+Detected Mask with confidence 0.584 at location [2449.06, 823.19, 3256.43, 1413.9]
+```
+
+結果をプロットしてみましょう:
+
+```py
+>>> draw = ImageDraw.Draw(image)
+
+>>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+...     box = [round(i, 2) for i in box.tolist()]
+...     x, y, x2, y2 = tuple(box)
+...     draw.rectangle((x, y, x2, y2), outline="red", width=1)
+...     draw.text((x, y), model.config.id2label[label.item()], fill="white")
+
+>>> image
+```
+
+<div class="flex justify-center">
+    <img src="https://i.imgur.com/4QZnf9A.png" alt="Object detection result on a new image"/>
+</div>
diff --git a/docs/source/ja/tasks/prompting.md b/docs/source/ja/tasks/prompting.md
new file mode 100644
index 00000000000000..c86a22ec7d0083
--- /dev/null
+++ b/docs/source/ja/tasks/prompting.md
@@ -0,0 +1,438 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# LLM prompting guide
+
+[[open-in-colab]]
+
+Falcon、LLaMA などの大規模言語モデルは、事前にトレーニングされたトランスフォーマー モデルであり、最初は予測するようにトレーニングされています。
+入力テキストが与えられた場合の次のトークン。通常、数十億のパラメータがあり、何兆ものパラメータでトレーニングされています。
+長期間のトークン。その結果、これらのモデルは非常に強力で多用途になり、次のようなことが可能になります。
+自然言語プロンプトでモデルに指示することで、すぐに複数の NLP タスクを解決できます。
+
+最適な出力を保証するためにこのようなプロンプトを設計することは、多くの場合「プロンプト エンジニアリング」と呼ばれます。プロンプトエンジニアリングとは、
+かなりの量の実験を必要とする反復プロセス。自然言語ははるかに柔軟で表現力豊かです
+ただし、プログラミング言語よりもあいまいさが生じる可能性があります。同時に、自然言語によるプロンプト
+変化にはかなり敏感です。プロンプトにわずかな変更を加えただけでも、出力が大幅に異なる場合があります。
+
+すべてのケースに適合するプロンプトを作成するための正確なレシピはありませんが、研究者はいくつかの最良のレシピを考案しました。
+最適な結果をより一貫して達成するのに役立つ実践。
+
+このガイドでは、より優れた LLM プロンプトを作成し、さまざまな NLP タスクを解決するのに役立つプロンプト エンジニアリングのベスト プラクティスについて説明します。
+次のことを学びます:
+
+- [プロンプトの基本](#basic-prompts)
+- [LLM プロンプトのベスト プラクティス](#best-practices-of-llm-prompting)
+- [高度なプロンプト テクニック: 数回のプロンプトと思考の連鎖](#advanced-prompting-techniques)
+- [プロンプトを表示する代わりに微調整する場合](#prompting-vs-fine-tuning)
+
+<Tip>
+
+迅速なエンジニアリングは、LLM 出力最適化プロセスの一部にすぎません。もう 1 つの重要な要素は、
+最適なテキスト生成戦略。 LLM が生成時に後続の各トークンを選択する方法をカスタマイズできます。
+トレーニング可能なパラメータを一切変更せずにテキストを作成します。テキスト生成パラメータを微調整することで、
+生成されたテキストに繰り返しが含まれているため、より一貫性があり人間らしい響きになります。
+テキスト生成戦略とパラメーターはこのガイドの範囲外ですが、これらのトピックについて詳しくは、次のトピックを参照してください。
+次のガイド:
+ 
+* [LLM による生成](../llm_tutorial)
+* [テキスト生成戦略](../generation_strategies)
+
+</Tip>
+
+## Basics of prompting
+
+### Types of models 
+
+最新の LLM の大部分は、デコーダ専用のトランスフォーマーです。例としては、[LLaMA](../model_doc/llama)、
+[Llama2](../model_doc/llama2)、[Falcon](../model_doc/falcon)、[GPT2](../model_doc/gpt2)。ただし、遭遇する可能性があります
+エンコーダ デコーダ トランスフォーマ LLM も同様です。たとえば、[Flan-T5](../model_doc/flan-t5) や [BART](../model_doc/bart) です。
+
+エンコーダ デコーダ スタイルのモデルは通常、出力が入力に**大きく**依存する生成タスクで使用されます。
+たとえば、翻訳と要約です。デコーダ専用モデルは、他のすべてのタイプの生成タスクに使用されます。
+
+パイプラインを使用して LLM でテキストを生成する場合、使用している LLM のタイプを知ることが重要です。
+異なるパイプラインを使用します。
+
+`text-generation`パイプラインを使用してデコーダのみのモデルで推論を実行します。
+
+```python
+>>> from transformers import pipeline
+>>> import torch
+
+>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
+
+>>> generator = pipeline('text-generation', model = 'gpt2')
+>>> prompt = "Hello, I'm a language model"
+
+>>> generator(prompt, max_length = 30)
+[{'generated_text': "Hello, I'm a language model expert, so I'm a big believer in the concept that I know very well and then I try to look into"}]
+```
+
+エンコーダー/デコーダーを使用して推論を実行するには、`text2text-generation` パイプラインを使用します。
+
+```python
+>>> text2text_generator = pipeline("text2text-generation", model = 'google/flan-t5-base')
+>>> prompt = "Translate from English to French: I'm very happy to see you"
+
+>>> text2text_generator(prompt)
+[{'generated_text': 'Je suis très heureuse de vous rencontrer.'}]
+```
+
+### Base vs instruct/chat models
+
+🤗 Hub で利用できる最近の LLM チェックポイントのほとんどには、base と instruct (または chat) の 2 つのバージョンがあります。例えば、
+[`tiiuae/falcon-7b`](https://huggingface.co/tiiuae/falcon-7b) および [`tiiuae/falcon-7b-instruct`](https://huggingface.co/tiiuae/falcon-7b) -指示する)。
+
+基本モデルは、最初のプロンプトが与えられたときにテキストを完成させるのには優れていますが、NLP タスクには理想的ではありません。
+指示に従う必要がある場合、または会話で使用する場合に使用します。ここで、指示 (チャット) バージョンが登場します。
+これらのチェックポイントは、命令と会話データに基づいて事前トレーニングされたベース バージョンをさらに微調整した結果です。
+この追加の微調整により、多くの NLP タスクにとってより適切な選択肢になります。
+
+[`tiiuae/falcon-7b-instruct`](https://huggingface.co/tiiuae/falcon-7b-instruct) で使用できるいくつかの簡単なプロンプトを示してみましょう。
+いくつかの一般的な NLP タスクを解決します。
+
+### NLP tasks 
+
+まず、環境をセットアップしましょう。
+
+```bash
+pip install -q transformers accelerate
+```
+
+次に、適切なパイプライン (`text_generation`) を使用してモデルをロードしましょう。
+
+```python
+>>> from transformers import pipeline, AutoTokenizer
+>>> import torch
+
+>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
+>>> model = "tiiuae/falcon-7b-instruct"
+
+>>> tokenizer = AutoTokenizer.from_pretrained(model)
+>>> pipe = pipeline(
+...     "text-generation",
+...     model=model,
+...     tokenizer=tokenizer,
+...     torch_dtype=torch.bfloat16,
+...     device_map="auto",
+... )
+```
+
+<Tip>
+
+Falcon モデルは `bfloat16` データ型を使用してトレーニングされたため、同じものを使用することをお勧めします。これには、最近の
+CUDA のバージョンに準拠しており、最新のカードで最適に動作します。
+
+</Tip>
+
+パイプライン経由でモデルをロードしたので、プロンプトを使用して NLP タスクを解決する方法を見てみましょう。
+
+#### Text classification
+
+テキスト分類の最も一般的な形式の 1 つはセンチメント分析であり、「ポジティブ」、「ネガティブ」、「ネガティブ」などのラベルを割り当てます。
+または、一連のテキストに対して「中立」です。与えられたテキスト (映画レビュー) を分類するようにモデルに指示するプロンプトを作成してみましょう。
+まず指示を与え、次に分類するテキストを指定します。そのままにしておくのではなく、
+応答の先頭にも追加します - `"Sentiment: "`:
+
+```python
+>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
+>>> prompt = """Classify the text into neutral, negative or positive. 
+... Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen.
+... Sentiment:
+... """
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=10,
+... )
+
+>>> for seq in sequences:
+...     print(f"Result: {seq['generated_text']}")
+Result: Classify the text into neutral, negative or positive. 
+Text: This movie is definitely one of my favorite movies of its kind. The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen.
+Sentiment:
+Positive
+```
+
+その結果、出力には、手順で提供したリストの分類ラベルが含まれており、それは正しいラベルです。
+
+<Tip>
+
+プロンプトに加えて、`max_new_tokens`パラメータを渡していることに気づくかもしれません。トークンの数を制御します。
+モデルが生成します。これは、学習できる多くのテキスト生成パラメーターの 1 つです。
+[テキスト生成戦略](../generation_strategies) ガイドを参照してください。
+
+</Tip>
+
+#### Named Entity Recognition
+
+固有表現認識 (NER) は、テキスト内の人物、場所、組織などの固有表現を検索するタスクです。
+プロンプトの指示を変更して、LLM にこのタスクを実行させましょう。ここでは`return_full_text = False`も設定しましょう
+出力にプロンプ​​トが含​​まれないようにします。
+
+```python
+>>> torch.manual_seed(1) # doctest: +IGNORE_RESULT
+>>> prompt = """Return a list of named entities in the text.
+... Text: The Golden State Warriors are an American professional basketball team based in San Francisco.
+... Named entities:
+... """
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=15,
+...     return_full_text = False,    
+... )
+
+>>> for seq in sequences:
+...     print(f"{seq['generated_text']}")
+- Golden State Warriors
+- San Francisco
+```
+
+ご覧のとおり、モデルは指定されたテキストから 2 つの名前付きエンティティを正しく識別しました。
+
+#### Translation
+
+LLM が実行できるもう 1 つのタスクは翻訳です。このタスクにはエンコーダー/デコーダー モデルを使用することを選択できますが、ここでは
+例を簡単にするために、きちんとした仕事をする Falcon-7b-instruct を使い続けます。もう一度、方法は次のとおりです
+テキストの一部を英語からイタリア語に翻訳するようにモデルに指示する基本的なプロンプトを作成できます。
+
+```python
+>>> torch.manual_seed(2) # doctest: +IGNORE_RESULT
+>>> prompt = """Translate the English text to Italian.
+... Text: Sometimes, I've believed as many as six impossible things before breakfast.
+... Translation:
+... """
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=20,
+...     do_sample=True,
+...     top_k=10,
+...     return_full_text = False,
+... )
+
+>>> for seq in sequences:
+...     print(f"{seq['generated_text']}")
+A volte, ho creduto a sei impossibili cose prima di colazione.
+```
+
+ここでは、出力生成時にモデルがもう少し柔軟になるように `do_sample=True` と `top_k=10` を追加しました。
+
+#### Text summarization
+
+翻訳と同様に、テキストの要約も、出力が入力に**大きく**依存する生成タスクです。
+エンコーダ/デコーダ モデルの方が良い選択になる可能性があります。ただし、デコーダ スタイルのモデルもこのタスクに使用できます。
+以前は、プロンプトの先頭に指示を配置していました。ただし、プロンプトの最後で、
+指示を与えるのに適した場所でもあります。通常、命令はどちらかの端に配置することをお勧めします。
+
+```python
+>>> torch.manual_seed(3) # doctest: +IGNORE_RESULT
+>>> prompt = """Permaculture is a design process mimicking the diversity, functionality and resilience of natural ecosystems. The principles and practices are drawn from traditional ecological knowledge of indigenous cultures combined with modern scientific understanding and technological innovations. Permaculture design provides a framework helping individuals and communities develop innovative, creative and effective strategies for meeting basic needs while preparing for and mitigating the projected impacts of climate change.
+... Write a summary of the above text.
+... Summary:
+... """
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=30,
+...     do_sample=True,
+...     top_k=10,
+...     return_full_text = False,
+... )
+
+>>> for seq in sequences:
+...     print(f"{seq['generated_text']}")
+Permaculture is an ecological design mimicking natural ecosystems to meet basic needs and prepare for climate change. It is based on traditional knowledge and scientific understanding.
+```
+
+#### Question answering
+
+質問応答タスクの場合、プロンプトを次の論理コンポーネントに構造化できます: 指示、コンテキスト、質問、
+先頭の単語またはフレーズ (`"Answer:"`) を使用して、モデルを操作して答えの生成を開始します。
+
+
+```python
+>>> torch.manual_seed(4) # doctest: +IGNORE_RESULT
+>>> prompt = """Answer the question using the context below.
+... Context: Gazpacho is a cold soup and drink made of raw, blended vegetables. Most gazpacho includes stale bread, tomato, cucumbers, onion, bell peppers, garlic, olive oil, wine vinegar, water, and salt. Northern recipes often include cumin and/or pimentón (smoked sweet paprika). Traditionally, gazpacho was made by pounding the vegetables in a mortar with a pestle; this more laborious method is still sometimes used as it helps keep the gazpacho cool and avoids the foam and silky consistency of smoothie versions made in blenders or food processors.
+... Question: What modern tool is used to make gazpacho?
+... Answer:
+... """
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=10,
+...     do_sample=True,
+...     top_k=10,
+...     return_full_text = False,
+... )
+
+>>> for seq in sequences:
+...     print(f"Result: {seq['generated_text']}")
+Result: Modern tools are used, such as immersion blenders
+```
+
+#### Reasoning
+
+LLM にとって推論は最も困難なタスクの 1 つであり、良い結果を達成するには、多くの場合、次のような高度なプロンプト テクニックを適用する必要があります。
+[Chain-of-thought](#chain-of-thought)。
+
+基本的なプロンプトを使用して、単純な算術タスクに関するモデル推論を作成できるかどうか試してみましょう。
+
+```python
+>>> torch.manual_seed(5) # doctest: +IGNORE_RESULT
+>>> prompt = """There are 5 groups of students in the class. Each group has 4 students. How many students are there in the class?"""
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=30,
+...     do_sample=True,
+...     top_k=10,
+...     return_full_text = False,
+... )
+
+>>> for seq in sequences:
+...     print(f"Result: {seq['generated_text']}")
+Result: 
+There are a total of 5 groups, so there are 5 x 4=20 students in the class.
+```
+
+正しい！もう少し複雑さを増やして、基本的なプロンプトで問題を解決できるかどうかを確認してみましょう。
+
+```python
+>>> torch.manual_seed(6) # doctest: +IGNORE_RESULT
+>>> prompt = """I baked 15 muffins. I ate 2 muffins and gave 5 muffins to a neighbor. My partner then bought 6 more muffins and ate 2. How many muffins do we now have?"""
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=10,
+...     do_sample=True,
+...     top_k=10,
+...     return_full_text = False,
+... )
+
+>>> for seq in sequences:
+...     print(f"Result: {seq['generated_text']}")
+Result: 
+The total number of muffins now is 21
+```
+
+これは間違った答えです。12 である必要があります。この場合、プロンプトが基本的すぎるか、選択内容が原因である可能性があります。
+結局のところ、Falcon の最小バージョンを選択しました。あらゆるサイズのモデルでは推論が困難ですが、より大きなモデルでは
+モデルのパフォーマンスが向上する可能性があります。
+
+## Best practices of LLM prompting
+
+ガイドのこのセクションでは、プロンプトの結果を改善する傾向にあるベスト プラクティスのリストをまとめました。
+
+* 使用するモデルを選択する場合は、最新かつ最も機能的なモデルの方がパフォーマンスが向上する可能性があります。
+* シンプルで短いプロンプトから始めて、そこから繰り返します。
+* 指示はプロンプトの最初または最後に入力してください。大規模なコンテキストを扱う場合、モデルはさまざまな最適化を適用して、アテンションの複雑さが二次的に拡大するのを防ぎます。これにより、モデルはプロンプトの途中よりも最初または最後に注意を払うようになります。
+* 指示と、それが適用されるテキストを明確に区別してください。これについては、次のセクションで詳しく説明します。
+* タスクと望ましい結果 (その形式、長さ、スタイル、言語など) について具体的かつ説明的にします。
+* 曖昧な説明や指示は避けてください。
+*「何をしてはいけないか」という指示ではなく、「何をすべきか」という指示を優先します。
+* 最初の単語を書いて (またはモデルの最初の文を始めて)、出力を正しい方向に「導き」ます。
+* [Few-shot prompting](#few-shot-prompting) や [Chain-of-thought](#chain-of-thought) などの高度なテクニックを使用します。
+* さまざまなモデルでプロンプトをテストして、その堅牢性を評価します。
+* プロンプトのバージョンを確認し、パフォーマンスを追跡します。
+
+## Advanced prompting techniques
+
+### Few-shot prompting
+
+上記のセクションの基本的なプロンプトは、「ゼロショット」プロンプトの例です。つまり、モデルにはすでに与えられています。
+指示とコンテキストはありますが、解決策を含む例はありません。通常、命令データセットに基づいて微調整された LLM
+このような「ゼロショット」タスクでも優れたパフォーマンスを発揮します。ただし、タスクがより複雑であったり微妙な点があったりする場合があります。
+出力には、命令だけではモデルが理解できないいくつかの要件があります。この場合、次のことができます。
+少数ショット プロンプトと呼ばれるテクニックを試してください。
+
+少数ショット プロンプトでは、モデルにパフォーマンスを向上させるためのより多くのコンテキストを提供するプロンプト内の例が提供されます。
+例では、例のパターンに従って出力を生成するようにモデルを条件付けします。
+
+以下に例を示します。
+
+```python
+>>> torch.manual_seed(0) # doctest: +IGNORE_RESULT
+>>> prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961.
+... Date: 04/12/1961
+... Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. 
+... Date:"""
+
+>>> sequences = pipe(
+...     prompt,
+...     max_new_tokens=8,
+...     do_sample=True,
+...     top_k=10,
+... )
+
+>>> for seq in sequences:
+...     print(f"Result: {seq['generated_text']}")
+Result: Text: The first human went into space and orbited the Earth on April 12, 1961.
+Date: 04/12/1961
+Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. 
+Date: 09/28/1960
+```
+
+上記のコード スニペットでは、モデルへの目的の出力を示すために 1 つの例を使用しました。したがって、これは、
+「ワンショット」プロンプト。ただし、タスクの複雑さに応じて、複数の例を使用する必要がある場合があります。
+
+数回のプロンプト手法の制限:
+- LLM は例のパターンを理解できますが、これらの手法は複雑な推論タスクではうまく機能しません。
+- 少数ショットのプロンプトでは、長いプロンプトを作成する必要があります。大量のトークンを含むプロンプトでは、計算量と待ち時間が増加する可能性があります。プロンプトの長さにも制限があります。
+- 多くの例を与えると、モデルが学習するつもりのなかったパターンを学習することがあります。 3番目の映画レビューはいつも否定的だということ。
+
+### Chain-of-thought
+
+思考連鎖 (CoT) プロンプトは、モデルを微調整して中間推論ステップを生成し、改善する手法です。
+複雑な推論タスクの結果。
+
+モデルを操作して推論ステップを生成するには、2 つの方法があります。
+- 質問に対する詳細な回答を含む例を示し、問題に対処する方法をモデルに示すことで、数回のプロンプトを表示します。
+- 「ステップごとに考えてみましょう」または「深呼吸して、問題をステップごとに解決してください」などのフレーズを追加してモデルに推論を指示します。
+
+[推論セクション](#reasoning) のマフィンの例に CoT テクニックを適用し、より大きなモデルを使用すると、
+[HuggingChat](https://huggingface.co/chat/)で遊べる(`tiiuae/falcon-180B-chat`)など、
+推論結果は大幅に改善されます。
+
+```text
+Let's go through this step-by-step:
+1. You start with 15 muffins.
+2. You eat 2 muffins, leaving you with 13 muffins.
+3. You give 5 muffins to your neighbor, leaving you with 8 muffins.
+4. Your partner buys 6 more muffins, bringing the total number of muffins to 14.
+5. Your partner eats 2 muffins, leaving you with 12 muffins.
+Therefore, you now have 12 muffins.
+```
+
+## Prompting vs fine-tuning
+
+プロンプトを最適化することで優れた結果を達成できますが、モデルを微調整するかどうかについてはまだ思案するかもしれません。
+あなたの場合にはもっとうまくいくでしょう。より小規模なモデルを微調整することが好ましいオプションである場合のいくつかのシナリオを次に示します。
+
+- ドメインが LLM が事前にトレーニングされたものと大きく異なっており、広範なプロンプト最適化では十分な結果が得られませんでした。
+- モデルが低リソース言語で適切に動作する必要があります。
+- 厳格な規制の下にある機密データでモデルをトレーニングする必要があります。
+- コスト、プライバシー、インフラストラクチャ、またはその他の制限により、小規模なモデルを使用する必要があります。
+
+上記のすべての例で、十分な大きさのファイルをすでに持っているか、簡単に入手できるかを確認する必要があります。
+ドメイン固有のデータセットを合理的なコストでモデルを微調整できます。十分な時間とリソースも必要になります
+モデルを微調整します。
+
+上記の例が当てはまらない場合は、プロンプトを最適化する方が有益であることがわかります。
diff --git a/docs/source/ja/tasks/question_answering.md b/docs/source/ja/tasks/question_answering.md
new file mode 100644
index 00000000000000..9c2ca869ffc5d6
--- /dev/null
+++ b/docs/source/ja/tasks/question_answering.md
@@ -0,0 +1,445 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Question answering
+
+[[open-in-colab]]
+
+<Youtube id="ajPx5LwJD-I"/>
+
+質問応答タスクは、質問に対して回答を返します。 Alexa、Siri、Google などの仮想アシスタントに天気を尋ねたことがあるなら、質問応答モデルを使用したことがあるはずです。質問応答タスクには一般的に 2 つのタイプがあります。
+
+- 抽出: 与えられたコンテキストから回答を抽出します。
+- 抽象的: 質問に正しく答えるコンテキストから回答を生成します。
+
+このガイドでは、次の方法を説明します。
+
+1. 抽出的質問応答用に [SQuAD](https://huggingface.co/datasets/squad) データセット上の [DistilBERT](https://huggingface.co/distilbert-base-uncased) を微調整します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+
+[ALBERT](../model_doc/albert), [BART](../model_doc/bart), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [OpenAI GPT-2](../model_doc/gpt2), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT-J](../model_doc/gptj), [I-BERT](../model_doc/ibert), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LED](../model_doc/led), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [LXMERT](../model_doc/lxmert), [MarkupLM](../model_doc/markuplm), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [OPT](../model_doc/opt), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [Splinter](../model_doc/splinter), [SqueezeBERT](../model_doc/squeezebert), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load SQuAD dataset
+
+まず、🤗 データセット ライブラリから SQuAD データセットの小さいサブセットを読み込みます。これにより、完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+
+```py
+>>> from datasets import load_dataset
+
+>>> squad = load_dataset("squad", split="train[:5000]")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train` 分割をトレイン セットとテスト セットに分割します。
+
+```py
+>>> squad = squad.train_test_split(test_size=0.2)
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> squad["train"][0]
+{'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
+ 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
+ 'id': '5733be284776f41900661182',
+ 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
+ 'title': 'University_of_Notre_Dame'
+}
+```
+
+ここにはいくつかの重要なフィールドがあります。
+
+- `answers`: 回答トークンと回答テキストの開始位置。
+- `context`: モデルが答えを抽出するために必要な背景情報。
+- `question`: モデルが答える必要がある質問。
+
+## Preprocess
+
+<Youtube id="qgaM0weJHpA"/>
+
+次のステップでは、DistilBERT トークナイザーをロードして`question`フィールドと`context`フィールドを処理します。
+
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+```
+
+質問応答タスクに特有の、注意すべき前処理手順がいくつかあります。
+
+1. データセット内の一部の例には、モデルの最大入力長を超える非常に長い「コンテキスト」が含まれる場合があります。より長いシーケンスを処理するには、`truncation="only_second"` を設定して `context` のみを切り捨てます。
+2. 次に、設定によって、回答の開始位置と終了位置を元の `context`にマッピングします。
+   「`return_offset_mapping=True`」。
+3. マッピングが手元にあるので、答えの開始トークンと終了トークンを見つけることができます。 [`~tokenizers.Encoding.sequence_ids`] メソッドを使用して、
+   オフセットのどの部分が`question`に対応し、どの部分が`context`に対応するかを見つけます。
+
+以下に、`answer`の開始トークンと終了トークンを切り詰めて`context`にマッピングする関数を作成する方法を示します。
+
+```py
+>>> def preprocess_function(examples):
+...     questions = [q.strip() for q in examples["question"]]
+...     inputs = tokenizer(
+...         questions,
+...         examples["context"],
+...         max_length=384,
+...         truncation="only_second",
+...         return_offsets_mapping=True,
+...         padding="max_length",
+...     )
+
+...     offset_mapping = inputs.pop("offset_mapping")
+...     answers = examples["answers"]
+...     start_positions = []
+...     end_positions = []
+
+...     for i, offset in enumerate(offset_mapping):
+...         answer = answers[i]
+...         start_char = answer["answer_start"][0]
+...         end_char = answer["answer_start"][0] + len(answer["text"][0])
+...         sequence_ids = inputs.sequence_ids(i)
+
+...         # Find the start and end of the context
+...         idx = 0
+...         while sequence_ids[idx] != 1:
+...             idx += 1
+...         context_start = idx
+...         while sequence_ids[idx] == 1:
+...             idx += 1
+...         context_end = idx - 1
+
+...         # If the answer is not fully inside the context, label it (0, 0)
+...         if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
+...             start_positions.append(0)
+...             end_positions.append(0)
+...         else:
+...             # Otherwise it's the start and end token positions
+...             idx = context_start
+...             while idx <= context_end and offset[idx][0] <= start_char:
+...                 idx += 1
+...             start_positions.append(idx - 1)
+
+...             idx = context_end
+...             while idx >= context_start and offset[idx][1] >= end_char:
+...                 idx -= 1
+...             end_positions.append(idx + 1)
+
+...     inputs["start_positions"] = start_positions
+...     inputs["end_positions"] = end_positions
+...     return inputs
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] 関数を使用します。 `batched=True` を設定してデータセットの複数の要素を一度に処理することで、`map` 関数を高速化できます。不要な列を削除します。
+
+```py
+>>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
+```
+
+次に、[`DefaultDataCollat​​or`] を使用してサンプルのバッチを作成します。 🤗 Transformers の他のデータ照合器とは異なり、[`DefaultDataCollat​​or`] はパディングなどの追加の前処理を適用しません。
+
+<frameworkcontent>
+<pt>
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator()
+```
+</pt>
+<tf>
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator(return_tensors="tf")
+```
+</tf>
+</frameworkcontent>
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForQuestionAnswering`] を使用して DitilBERT をロードします。
+
+```py
+>>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
+
+>>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。
+2. トレーニング引数をモデル、データセット、トークナイザー、データ照合器とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_qa_model",
+...     evaluation_strategy="epoch",
+...     learning_rate=2e-5,
+...     per_device_train_batch_size=16,
+...     per_device_eval_batch_size=16,
+...     num_train_epochs=3,
+...     weight_decay=0.01,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=tokenized_squad["train"],
+...     eval_dataset=tokenized_squad["test"],
+...     tokenizer=tokenizer,
+...     data_collator=data_collator,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-a-tensorflow-model-with-keras) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+</ヒント>
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 16
+>>> num_epochs = 2
+>>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
+>>> optimizer, schedule = create_optimizer(
+...     init_lr=2e-5,
+...     num_warmup_steps=0,
+...     num_train_steps=total_train_steps,
+... )
+```
+
+次に、[`TFAutoModelForQuestionAnswering`] を使用して DistilBERT をロードできます。
+
+```py
+>>> from transformers import TFAutoModelForQuestionAnswering
+
+>>> model = TFAutoModelForQuestionAnswering("distilbert-base-uncased")
+```
+
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+...     tokenized_squad["train"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_validation_set = model.prepare_tf_dataset(
+...     tokenized_squad["test"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)
+```
+
+トレーニングを開始する前に最後にセットアップすることは、モデルをハブにプッシュする方法を提供することです。これは、モデルとトークナイザーを [`~transformers.PushToHubCallback`] でプッシュする場所を指定することで実行できます。
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> callback = PushToHubCallback(
+...     output_dir="my_awesome_qa_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback])
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+質問応答用のモデルを微調整する方法の詳細な例については、対応するドキュメントを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)。
+
+</Tip>
+
+## Evaluate
+
+質問応答の評価には、大量の後処理が必要です。時間がかかりすぎないように、このガイドでは評価ステップを省略しています。 [`Trainer`] はトレーニング中に評価損失を計算するため、モデルのパフォーマンスについて完全に分からないわけではありません。
+
+もっと時間があり、質問応答用のモデルを評価する方法に興味がある場合は、[質問応答](https://huggingface.co/course/chapter7/7?fw=pt#postprocessing) の章を参照してください。 🤗ハグフェイスコースから！
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+質問と、モデルに予測させたいコンテキストを考え出します。
+
+```py
+>>> question = "How many programming languages does BLOOM support?"
+>>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して質問応答用の`pipeline`をインスタンス化し、それにテキストを渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
+>>> question_answerer(question=question, context=context)
+{'score': 0.2058267742395401,
+ 'start': 10,
+ 'end': 95,
+ 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}
+```
+
+必要に応じて、`pipeline`の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+
+テキストをトークン化して PyTorch テンソルを返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
+>>> inputs = tokenizer(question, context, return_tensors="pt")
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+
+```py
+>>> import torch
+>>> from transformers import AutoModelForQuestionAnswering
+
+>>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+```
+
+モデル出力から開始位置と終了位置の最も高い確率を取得します。
+
+```py
+>>> answer_start_index = outputs.start_logits.argmax()
+>>> answer_end_index = outputs.end_logits.argmax()
+```
+
+予測されたトークンをデコードして答えを取得します。
+
+```py
+>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+>>> tokenizer.decode(predict_answer_tokens)
+'176 billion parameters and can generate text in 46 languages natural languages and 13'
+```
+</pt>
+<tf>
+
+テキストをトークン化し、TensorFlow テンソルを返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
+>>> inputs = tokenizer(question, text, return_tensors="tf")
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+
+```py
+>>> from transformers import TFAutoModelForQuestionAnswering
+
+>>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
+>>> outputs = model(**inputs)
+```
+
+モデル出力から開始位置と終了位置の最も高い確率を取得します。
+
+```py
+>>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
+>>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
+```
+
+予測されたトークンをデコードして答えを取得します。
+
+```py
+>>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+>>> tokenizer.decode(predict_answer_tokens)
+'176 billion parameters and can generate text in 46 languages natural languages and 13'
+```
+
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/semantic_segmentation.md b/docs/source/ja/tasks/semantic_segmentation.md
new file mode 100644
index 00000000000000..2816688b4e1c14
--- /dev/null
+++ b/docs/source/ja/tasks/semantic_segmentation.md
@@ -0,0 +1,605 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Semantic segmentation
+
+[[open-in-colab]]
+
+<Youtube id="dKE8SIt9C-w"/>
+
+セマンティック セグメンテーションでは、画像の個々のピクセルにラベルまたはクラスを割り当てます。セグメンテーションにはいくつかのタイプがありますが、セマンティック セグメンテーションの場合、同じオブジェクトの一意のインスタンス間の区別は行われません。両方のオブジェクトに同じラベルが付けられます (たとえば、`car-1`と`car-2`の代わりに`car`)。セマンティック セグメンテーションの一般的な現実世界のアプリケーションには、歩行者や重要な交通情報を識別するための自動運転車のトレーニング、医療画像内の細胞と異常の識別、衛星画像からの環境変化の監視などが含まれます。
+
+このガイドでは、次の方法を説明します。
+
+1. [SceneParse150](https://huggingface.co/datasets/scene_parse_150) データセットの [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) を微調整します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q datasets transformers evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load SceneParse150 dataset
+
+まず、SceneParse150 データセットの小さいサブセットを 🤗 データセット ライブラリから読み込みます。これにより、完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> ds = load_dataset("scene_parse_150", split="train[:50]")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train` 分割をトレイン セットとテスト セットに分割します。
+
+
+```py
+>>> ds = ds.train_test_split(test_size=0.2)
+>>> train_ds = ds["train"]
+>>> test_ds = ds["test"]
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> train_ds[0]
+{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x683 at 0x7F9B0C201F90>,
+ 'annotation': <PIL.PngImagePlugin.PngImageFile image mode=L size=512x683 at 0x7F9B0C201DD0>,
+ 'scene_category': 368}
+```
+
+- `image`: シーンの PIL イメージ。
+- `annotation`: セグメンテーション マップの PIL イメージ。モデルのターゲットでもあります。
+- `scene_category`: "kitchen"や"office"などの画像シーンを説明するカテゴリ ID。このガイドでは、`image`と`annotation`のみが必要になります。どちらも PIL イメージです。
+
+また、ラベル ID をラベル クラスにマップする辞書を作成することもできます。これは、後でモデルを設定するときに役立ちます。ハブからマッピングをダウンロードし、`id2label` および `label2id` ディクショナリを作成します。
+
+```py
+>>> import json
+>>> from huggingface_hub import cached_download, hf_hub_url
+
+>>> repo_id = "huggingface/label-files"
+>>> filename = "ade20k-id2label.json"
+>>> id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
+>>> id2label = {int(k): v for k, v in id2label.items()}
+>>> label2id = {v: k for k, v in id2label.items()}
+>>> num_labels = len(id2label)
+```
+
+## Preprocess
+
+次のステップでは、SegFormer 画像プロセッサをロードして、モデルの画像と注釈を準備します。このデータセットのような一部のデータセットは、バックグラウンド クラスとしてゼロインデックスを使用します。ただし、実際には背景クラスは 150 個のクラスに含まれていないため、`reduce_labels=True`を設定してすべてのラベルから 1 つを引く必要があります。ゼロインデックスは `255` に置き換えられるため、SegFormer の損失関数によって無視されます。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> checkpoint = "nvidia/mit-b0"
+>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True)
+```
+
+<frameworkcontent>
+<pt>
+
+モデルを過学習に対してより堅牢にするために、画像データセットにいくつかのデータ拡張を適用するのが一般的です。このガイドでは、[torchvision](https://pytorch.org/vision/stable/index.html) の [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) 関数を使用します。 ) を使用して画像の色のプロパティをランダムに変更しますが、任意の画像ライブラリを使用することもできます。
+
+```py
+>>> from torchvision.transforms import ColorJitter
+
+>>> jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
+```
+
+次に、モデルの画像と注釈を準備するための 2 つの前処理関数を作成します。これらの関数は、画像を`pixel_values`に変換し、注釈を`labels`に変換します。トレーニング セットの場合、画像を画像プロセッサに提供する前に `jitter` が適用されます。テスト セットの場合、テスト中にデータ拡張が適用されないため、画像プロセッサは`images`を切り取って正規化し、`ラベル`のみを切り取ります。
+
+```py
+>>> def train_transforms(example_batch):
+...     images = [jitter(x) for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+
+
+>>> def val_transforms(example_batch):
+...     images = [x for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+```
+
+データセット全体に`jitter`を適用するには、🤗 Datasets [`~datasets.Dataset.set_transform`] 関数を使用します。変換はオンザフライで適用されるため、高速で消費するディスク容量が少なくなります。
+
+```py
+>>> train_ds.set_transform(train_transforms)
+>>> test_ds.set_transform(val_transforms)
+```
+
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+
+モデルを過学習に対してより堅牢にするために、画像データセットにいくつかのデータ拡張を適用するのが一般的です。
+このガイドでは、[`tf.image`](https://www.tensorflow.org/api_docs/python/tf/image) を使用して画像の色のプロパティをランダムに変更しますが、任意のプロパティを使用することもできます。画像
+好きな図書館。
+2 つの別々の変換関数を定義します。
+- 画像拡張を含むトレーニング データ変換
+- 🤗 Transformers のコンピューター ビジョン モデルはチャネル優先のレイアウトを想定しているため、画像を転置するだけの検証データ変換
+
+```py
+>>> import tensorflow as tf
+
+
+>>> def aug_transforms(image):
+...     image = tf.keras.utils.img_to_array(image)
+...     image = tf.image.random_brightness(image, 0.25)
+...     image = tf.image.random_contrast(image, 0.5, 2.0)
+...     image = tf.image.random_saturation(image, 0.75, 1.25)
+...     image = tf.image.random_hue(image, 0.1)
+...     image = tf.transpose(image, (2, 0, 1))
+...     return image
+
+
+>>> def transforms(image):
+...     image = tf.keras.utils.img_to_array(image)
+...     image = tf.transpose(image, (2, 0, 1))
+...     return image
+```
+
+次に、モデルの画像と注釈のバッチを準備する 2 つの前処理関数を作成します。これらの機能が適用されます
+画像変換を行い、以前にロードされた `image_processor` を使用して画像を `pixel_values` に変換し、
+`labels`への注釈。 `ImageProcessor` は、画像のサイズ変更と正規化も処理します。
+
+```py
+>>> def train_transforms(example_batch):
+...     images = [aug_transforms(x.convert("RGB")) for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+
+
+>>> def val_transforms(example_batch):
+...     images = [transforms(x.convert("RGB")) for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+```
+
+データセット全体に前処理変換を適用するには、🤗 Datasets [`~datasets.Dataset.set_transform`] 関数を使用します。
+変換はオンザフライで適用されるため、高速で消費するディスク容量が少なくなります。
+
+```py
+>>> train_ds.set_transform(train_transforms)
+>>> test_ds.set_transform(val_transforms)
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[Mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) メトリックをロードします (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照して、メトリクスをロードして計算する方法の詳細を確認してください)。
+
+```py
+>>> import evaluate
+
+>>> metric = evaluate.load("mean_iou")
+```
+
+次に、メトリクスを [`~evaluate.EvaluationModule.compute`] する関数を作成します。予測を次のように変換する必要があります
+最初にロジットを作成し、次に [`~evaluate.EvaluationModule.compute`] を呼び出す前にラベルのサイズに一致するように再形成します。
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> import numpy as np
+>>> import torch
+>>> from torch import nn
+
+>>> def compute_metrics(eval_pred):
+...     with torch.no_grad():
+...         logits, labels = eval_pred
+...         logits_tensor = torch.from_numpy(logits)
+...         logits_tensor = nn.functional.interpolate(
+...             logits_tensor,
+...             size=labels.shape[-2:],
+...             mode="bilinear",
+...             align_corners=False,
+...         ).argmax(dim=1)
+
+...         pred_labels = logits_tensor.detach().cpu().numpy()
+...         metrics = metric.compute(
+...             predictions=pred_labels,
+...             references=labels,
+...             num_labels=num_labels,
+...             ignore_index=255,
+...             reduce_labels=False,
+...         )
+...         for key, value in metrics.items():
+...             if type(value) is np.ndarray:
+...                 metrics[key] = value.tolist()
+...         return metrics
+```
+
+</pt>
+</frameworkcontent>
+
+
+<frameworkcontent>
+<tf>
+
+```py
+>>> def compute_metrics(eval_pred):
+...     logits, labels = eval_pred
+...     logits = tf.transpose(logits, perm=[0, 2, 3, 1])
+...     logits_resized = tf.image.resize(
+...         logits,
+...         size=tf.shape(labels)[1:],
+...         method="bilinear",
+...     )
+
+...     pred_labels = tf.argmax(logits_resized, axis=-1)
+...     metrics = metric.compute(
+...         predictions=pred_labels,
+...         references=labels,
+...         num_labels=num_labels,
+...         ignore_index=-1,
+...         reduce_labels=image_processor.do_reduce_labels,
+...     )
+
+...     per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
+...     per_category_iou = metrics.pop("per_category_iou").tolist()
+
+...     metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
+...     metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
+...     return {"val_" + k: v for k, v in metrics.items()}
+```
+
+</tf>
+</frameworkcontent>
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#finetune-with-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForSemanticSegmentation`] を使用して SegFormer をロードし、ラベル ID とラベル クラス間のマッピングをモデルに渡します。
+
+```py
+>>> from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer
+
+>>> model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。 `image` 列が削除されるため、未使用の列を削除しないことが重要です。 `image` 列がないと、`pixel_values` を作成できません。この動作を防ぐには、`remove_unused_columns=False`を設定してください。他に必要なパラメータは、モデルの保存場所を指定する `output_dir` だけです。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は IoU メトリックを評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="segformer-b0-scene-parse-150",
+...     learning_rate=6e-5,
+...     num_train_epochs=50,
+...     per_device_train_batch_size=2,
+...     per_device_eval_batch_size=2,
+...     save_total_limit=3,
+...     evaluation_strategy="steps",
+...     save_strategy="steps",
+...     save_steps=20,
+...     eval_steps=20,
+...     logging_steps=1,
+...     eval_accumulation_steps=5,
+...     remove_unused_columns=False,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=train_ds,
+...     eval_dataset=test_ds,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、まず [基本チュートリアル](./training#train-a-tensorflow-model-with-keras) を確認してください。
+
+</Tip>
+
+TensorFlow でモデルを微調整するには、次の手順に従います。
+1. トレーニングのハイパーパラメータを定義し、オプティマイザーと学習率スケジュールを設定します。
+2. 事前トレーニングされたモデルをインスタンス化します。
+3. 🤗 データセットを `tf.data.Dataset` に変換します。
+4. モデルをコンパイルします。
+5. コールバックを追加してメトリクスを計算し、モデルを 🤗 Hub にアップロードします
+6. `fit()` メソッドを使用してトレーニングを実行します。
+
+まず、ハイパーパラメーター、オプティマイザー、学習率スケジュールを定義します。
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 2
+>>> num_epochs = 50
+>>> num_train_steps = len(train_ds) * num_epochs
+>>> learning_rate = 6e-5
+>>> weight_decay_rate = 0.01
+
+>>> optimizer, lr_schedule = create_optimizer(
+...     init_lr=learning_rate,
+...     num_train_steps=num_train_steps,
+...     weight_decay_rate=weight_decay_rate,
+...     num_warmup_steps=0,
+... )
+```
+
+次に、ラベル マッピングとともに [`TFAutoModelForSemanticSegmentation`] を使用して SegFormer をロードし、それをコンパイルします。
+オプティマイザ。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> from transformers import TFAutoModelForSemanticSegmentation
+
+>>> model = TFAutoModelForSemanticSegmentation.from_pretrained(
+...     checkpoint,
+...     id2label=id2label,
+...     label2id=label2id,
+... )
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+[`~datasets.Dataset.to_tf_dataset`] と [`DefaultDataCollat​​or`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator(return_tensors="tf")
+
+>>> tf_train_dataset = train_ds.to_tf_dataset(
+...     columns=["pixel_values", "label"],
+...     shuffle=True,
+...     batch_size=batch_size,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_eval_dataset = test_ds.to_tf_dataset(
+...     columns=["pixel_values", "label"],
+...     shuffle=True,
+...     batch_size=batch_size,
+...     collate_fn=data_collator,
+... )
+```
+
+予測から精度を計算し、モデルを 🤗 ハブにプッシュするには、[Keras callbacks](../main_classes/keras_callbacks) を使用します。
+`compute_metrics` 関数を [`KerasMetricCallback`] に渡します。
+そして [`PushToHubCallback`] を使用してモデルをアップロードします。
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
+
+>>> metric_callback = KerasMetricCallback(
+...     metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
+... )
+
+>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)
+
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルをトレーニングする準備が整いました。トレーニングおよび検証データセット、エポック数、
+モデルを微調整するためのコールバック:
+
+
+```py
+>>> model.fit(
+...     tf_train_dataset,
+...     validation_data=tf_eval_dataset,
+...     callbacks=callbacks,
+...     epochs=num_epochs,
+... )
+```
+
+おめでとう！モデルを微調整し、🤗 Hub で共有しました。これで推論に使用できるようになりました。
+</tf>
+</frameworkcontent>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論のために画像をロードします。
+
+```py
+>>> image = ds[0]["image"]
+>>> image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-image.png" alt="Image of bedroom"/>
+</div>
+
+<frameworkcontent>
+<pt>
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して画像セグメンテーション用の `pipeline`をインスタンス化し、それに画像を渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model")
+>>> segmenter(image)
+[{'score': None,
+  'label': 'wall',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062690>},
+ {'score': None,
+  'label': 'sky',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062A50>},
+ {'score': None,
+  'label': 'floor',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062B50>},
+ {'score': None,
+  'label': 'ceiling',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062A10>},
+ {'score': None,
+  'label': 'bed ',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062E90>},
+ {'score': None,
+  'label': 'windowpane',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062390>},
+ {'score': None,
+  'label': 'cabinet',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062550>},
+ {'score': None,
+  'label': 'chair',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062D90>},
+ {'score': None,
+  'label': 'armchair',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062E10>}]
+```
+
+必要に応じて、`pipeline`の結果を手動で複製することもできます。画像を画像プロセッサで処理し、`pixel_values` を GPU に配置します。
+
+```py
+>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available, otherwise use a CPU
+>>> encoding = image_processor(image, return_tensors="pt")
+>>> pixel_values = encoding.pixel_values.to(device)
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+
+```py
+>>> outputs = model(pixel_values=pixel_values)
+>>> logits = outputs.logits.cpu()
+```
+
+次に、ロジットを元の画像サイズに再スケールします。
+
+```py
+>>> upsampled_logits = nn.functional.interpolate(
+...     logits,
+...     size=image.size[::-1],
+...     mode="bilinear",
+...     align_corners=False,
+... )
+
+>>> pred_seg = upsampled_logits.argmax(dim=1)[0]
+```
+
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+
+画像プロセッサをロードして画像を前処理し、入力を TensorFlow テンソルとして返します。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/scene_segmentation")
+>>> inputs = image_processor(image, return_tensors="tf")
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+```py
+>>> from transformers import TFAutoModelForSemanticSegmentation
+
+>>> model = TFAutoModelForSemanticSegmentation.from_pretrained("MariaK/scene_segmentation")
+>>> logits = model(**inputs).logits
+```
+
+次に、ロジットを元の画像サイズに再スケールし、クラス次元に argmax を適用します。
+
+```py
+>>> logits = tf.transpose(logits, [0, 2, 3, 1])
+
+>>> upsampled_logits = tf.image.resize(
+...     logits,
+...     # We reverse the shape of `image` because `image.size` returns width and height.
+...     image.size[::-1],
+... )
+
+>>> pred_seg = tf.math.argmax(upsampled_logits, axis=-1)[0]
+```
+
+</tf>
+</frameworkcontent>
+
+結果を視覚化するには、[データセット カラー パレット](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) を、それぞれをマップする `ade_palette()` としてロードします。クラスを RGB 値に変換します。次に、画像と予測されたセグメンテーション マップを組み合わせてプロットできます。
+
+```py
+>>> import matplotlib.pyplot as plt
+>>> import numpy as np
+
+>>> color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8)
+>>> palette = np.array(ade_palette())
+>>> for label, color in enumerate(palette):
+...     color_seg[pred_seg == label, :] = color
+>>> color_seg = color_seg[..., ::-1]  # convert to BGR
+
+>>> img = np.array(image) * 0.5 + color_seg * 0.5  # plot the image with the segmentation map
+>>> img = img.astype(np.uint8)
+
+>>> plt.figure(figsize=(15, 10))
+>>> plt.imshow(img)
+>>> plt.show()
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-preds.png" alt="Image of bedroom overlaid with segmentation map"/>
+</div>
diff --git a/docs/source/ja/tasks/sequence_classification.md b/docs/source/ja/tasks/sequence_classification.md
new file mode 100644
index 00000000000000..abbf79ad4b566d
--- /dev/null
+++ b/docs/source/ja/tasks/sequence_classification.md
@@ -0,0 +1,609 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Sequence classification
+
+[[open-in-colab]]
+
+<Youtube id="dKE8SIt9C-w"/>
+
+セマンティック セグメンテーションでは、画像の個々のピクセルにラベルまたはクラスを割り当てます。セグメンテーションにはいくつかのタイプがありますが、セマンティック セグメンテーションの場合、同じオブジェクトの一意のインスタンス間の区別は行われません。両方のオブジェクトに同じラベルが付けられます (たとえば、「car-1」と「car-2」の代わりに「car」)。セマンティック セグメンテーションの一般的な現実世界のアプリケーションには、歩行者や重要な交通情報を識別するための自動運転車のトレーニング、医療画像内の細胞と異常の識別、衛星画像からの環境変化の監視などが含まれます。
+
+このガイドでは、次の方法を説明します。
+
+1. [SceneParse150](https://huggingface.co/datasets/scene_parse_150) データセットの [SegFormer](https://huggingface.co/docs/transformers/main/en/model_doc/segformer#segformer) を微調整します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[BEiT](../model_doc/beit), [Data2VecVision](../model_doc/data2vec-vision), [DPT](../model_doc/dpt), [MobileNetV2](../model_doc/mobilenet_v2), [MobileViT](../model_doc/mobilevit), [MobileViTV2](../model_doc/mobilevitv2), [SegFormer](../model_doc/segformer), [UPerNet](../model_doc/upernet)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q datasets transformers evaluate
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load SceneParse150 dataset
+
+
+まず、SceneParse150 データセットの小さいサブセットを 🤗 データセット ライブラリから読み込みます。これにより、完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> ds = load_dataset("scene_parse_150", split="train[:50]")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットの `train` 分割をトレイン セットとテスト セットに分割します。
+
+```py
+>>> ds = ds.train_test_split(test_size=0.2)
+>>> train_ds = ds["train"]
+>>> test_ds = ds["test"]
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> train_ds[0]
+{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x683 at 0x7F9B0C201F90>,
+ 'annotation': <PIL.PngImagePlugin.PngImageFile image mode=L size=512x683 at 0x7F9B0C201DD0>,
+ 'scene_category': 368}
+```
+
+- `image`: シーンの PIL イメージ。
+- `annotation`: セグメンテーション マップの PIL イメージ。モデルのターゲットでもあります。
+- `scene_category`: 「キッチン」や「オフィス」などの画像シーンを説明するカテゴリ ID。このガイドでは、「image」と「annotation」のみが必要になります。どちらも PIL イメージです。
+
+また、ラベル ID をラベル クラスにマップする辞書を作成することもできます。これは、後でモデルを設定するときに役立ちます。ハブからマッピングをダウンロードし、`id2label` および `label2id` ディクショナリを作成します。
+
+```py
+>>> import json
+>>> from huggingface_hub import cached_download, hf_hub_url
+
+>>> repo_id = "huggingface/label-files"
+>>> filename = "ade20k-id2label.json"
+>>> id2label = json.load(open(cached_download(hf_hub_url(repo_id, filename, repo_type="dataset")), "r"))
+>>> id2label = {int(k): v for k, v in id2label.items()}
+>>> label2id = {v: k for k, v in id2label.items()}
+>>> num_labels = len(id2label)
+```
+
+## Preprocess
+
+次のステップでは、SegFormer 画像プロセッサをロードして、モデルの画像と注釈を準備します。このデータセットのような一部のデータセットは、バックグラウンド クラスとしてゼロインデックスを使用します。ただし、実際には背景クラスは 150 個のクラスに含まれていないため、`reduce_labels=True`を設定してすべてのラベルから 1 つを引く必要があります。ゼロインデックスは `255` に置き換えられるため、SegFormer の損失関数によって無視されます。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> checkpoint = "nvidia/mit-b0"
+>>> image_processor = AutoImageProcessor.from_pretrained(checkpoint, reduce_labels=True)
+```
+
+<frameworkcontent>
+<pt>
+
+モデルを過学習に対してより堅牢にするために、画像データセットにいくつかのデータ拡張を適用するのが一般的です。このガイドでは、[torchvision](https://pytorch.org) の [`ColorJitter`](https://pytorch.org/vision/stable/generated/torchvision.transforms.ColorJitter.html) 関数を使用します。 /vision/stable/index.html) を使用して画像の色のプロパティをランダムに変更しますが、任意の画像ライブラリを使用することもできます。
+
+```py
+>>> from torchvision.transforms import ColorJitter
+
+>>> jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1)
+```
+
+次に、モデルの画像と注釈を準備するための 2 つの前処理関数を作成します。これらの関数は、画像を`pixel_values`に変換し、注釈を`labels`に変換します。トレーニング セットの場合、画像を画像プロセッサに提供する前に`jitter`が適用されます。テスト セットの場合、テスト中にデータ拡張が適用されないため、画像プロセッサは`images`を切り取って正規化し、`labels` のみを切り取ります。
+
+```py
+>>> def train_transforms(example_batch):
+...     images = [jitter(x) for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+
+
+>>> def val_transforms(example_batch):
+...     images = [x for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+```
+
+データセット全体に`jitter`を適用するには、🤗 Datasets [`~datasets.Dataset.set_transform`] 関数を使用します。変換はオンザフライで適用されるため、高速で消費するディスク容量が少なくなります。
+
+```py
+>>> train_ds.set_transform(train_transforms)
+>>> test_ds.set_transform(val_transforms)
+```
+
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+
+モデルを過学習に対してより堅牢にするために、画像データセットにいくつかのデータ拡張を適用するのが一般的です。
+このガイドでは、[`tf.image`](https://www.tensorflow.org/api_docs/python/tf/image) を使用して画像の色のプロパティをランダムに変更しますが、任意のプロパティを使用することもできます。画像
+好きな図書館。
+2 つの別々の変換関数を定義します。
+- 画像拡張を含むトレーニング データ変換
+- 🤗 Transformers のコンピューター ビジョン モデルはチャネル優先のレイアウトを想定しているため、画像を転置するだけの検証データ変換
+
+```py
+>>> import tensorflow as tf
+
+
+>>> def aug_transforms(image):
+...     image = tf.keras.utils.img_to_array(image)
+...     image = tf.image.random_brightness(image, 0.25)
+...     image = tf.image.random_contrast(image, 0.5, 2.0)
+...     image = tf.image.random_saturation(image, 0.75, 1.25)
+...     image = tf.image.random_hue(image, 0.1)
+...     image = tf.transpose(image, (2, 0, 1))
+...     return image
+
+
+>>> def transforms(image):
+...     image = tf.keras.utils.img_to_array(image)
+...     image = tf.transpose(image, (2, 0, 1))
+...     return image
+```
+
+次に、モデルの画像と注釈のバッチを準備する 2 つの前処理関数を作成します。これらの機能が適用されます
+画像変換を行い、以前にロードされた `image_processor` を使用して画像を `pixel_values` に変換し、
+`labels`への注釈。 `ImageProcessor` は、画像のサイズ変更と正規化も処理します。
+
+```py
+>>> def train_transforms(example_batch):
+...     images = [aug_transforms(x.convert("RGB")) for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+
+
+>>> def val_transforms(example_batch):
+...     images = [transforms(x.convert("RGB")) for x in example_batch["image"]]
+...     labels = [x for x in example_batch["annotation"]]
+...     inputs = image_processor(images, labels)
+...     return inputs
+```
+
+データセット全体に前処理変換を適用するには、🤗 Datasets [`~datasets.Dataset.set_transform`] 関数を使用します。
+変換はオンザフライで適用されるため、高速で消費するディスク容量が少なくなります。
+
+```py
+>>> train_ds.set_transform(train_transforms)
+>>> test_ds.set_transform(val_transforms)
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[Mean Intersection over Union](https://huggingface.co/spaces/evaluate-metric/accuracy) (IoU) メトリックをロードします (🤗 Evaluate [クイック ツアー](https://huggingface.co) を参照してください) /docs/evaluate/a_quick_tour) を参照して、メトリクスをロードして計算する方法の詳細を確認してください)。
+
+```py
+>>> import evaluate
+
+>>> metric = evaluate.load("mean_iou")
+```
+
+次に、メトリクスを [`~evaluate.EvaluationModule.compute`] する関数を作成します。予測を次のように変換する必要があります
+最初にロジットを作成し、次に [`~evaluate.EvaluationModule.compute`] を呼び出す前にラベルのサイズに一致するように再形成します。
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> import numpy as np
+>>> import torch
+>>> from torch import nn
+
+>>> def compute_metrics(eval_pred):
+...     with torch.no_grad():
+...         logits, labels = eval_pred
+...         logits_tensor = torch.from_numpy(logits)
+...         logits_tensor = nn.functional.interpolate(
+...             logits_tensor,
+...             size=labels.shape[-2:],
+...             mode="bilinear",
+...             align_corners=False,
+...         ).argmax(dim=1)
+
+...         pred_labels = logits_tensor.detach().cpu().numpy()
+...         metrics = metric.compute(
+...             predictions=pred_labels,
+...             references=labels,
+...             num_labels=num_labels,
+...             ignore_index=255,
+...             reduce_labels=False,
+...         )
+...         for key, value in metrics.items():
+...             if type(value) is np.ndarray:
+...                 metrics[key] = value.tolist()
+...         return metrics
+```
+
+</pt>
+</frameworkcontent>
+
+
+<frameworkcontent>
+<tf>
+
+```py
+>>> def compute_metrics(eval_pred):
+...     logits, labels = eval_pred
+...     logits = tf.transpose(logits, perm=[0, 2, 3, 1])
+...     logits_resized = tf.image.resize(
+...         logits,
+...         size=tf.shape(labels)[1:],
+...         method="bilinear",
+...     )
+
+...     pred_labels = tf.argmax(logits_resized, axis=-1)
+...     metrics = metric.compute(
+...         predictions=pred_labels,
+...         references=labels,
+...         num_labels=num_labels,
+...         ignore_index=-1,
+...         reduce_labels=image_processor.do_reduce_labels,
+...     )
+
+...     per_category_accuracy = metrics.pop("per_category_accuracy").tolist()
+...     per_category_iou = metrics.pop("per_category_iou").tolist()
+
+...     metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)})
+...     metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)})
+...     return {"val_" + k: v for k, v in metrics.items()}
+```
+
+</tf>
+</frameworkcontent>
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[こちら](../training#finetune-with-trainer) の基本的なチュートリアルをご覧ください。
+
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForSemanticSegmentation`] を使用して SegFormer をロードし、ラベル ID とラベル クラス間のマッピングをモデルに渡します。
+
+```py
+>>> from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer
+
+>>> model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id)
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。 `image` 列が削除されるため、未使用の列を削除しないことが重要です。 `image` 列がないと、`pixel_values` を作成できません。この動作を防ぐには、`remove_unused_columns=False`を設定してください。他に必要なパラメータは、モデルの保存場所を指定する `output_dir` だけです。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は IoU メトリックを評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="segformer-b0-scene-parse-150",
+...     learning_rate=6e-5,
+...     num_train_epochs=50,
+...     per_device_train_batch_size=2,
+...     per_device_eval_batch_size=2,
+...     save_total_limit=3,
+...     evaluation_strategy="steps",
+...     save_strategy="steps",
+...     save_steps=20,
+...     eval_steps=20,
+...     logging_steps=1,
+...     eval_accumulation_steps=5,
+...     remove_unused_columns=False,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=train_ds,
+...     eval_dataset=test_ds,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、まず [基本チュートリアル](./training#train-a-tensorflow-model-with-keras) を確認してください。
+
+</Tip>
+
+TensorFlow でモデルを微調整するには、次の手順に従います。
+1. トレーニングのハイパーパラメータを定義し、オプティマイザーと学習率スケジュールを設定します。
+2. 事前トレーニングされたモデルをインスタンス化します。
+3. 🤗 データセットを `tf.data.Dataset` に変換します。
+4. モデルをコンパイルします。
+5. コールバックを追加してメトリクスを計算し、モデルを 🤗 Hub にアップロードします
+6. `fit()` メソッドを使用してトレーニングを実行します。
+
+まず、ハイパーパラメーター、オプティマイザー、学習率スケジュールを定義します。
+
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 2
+>>> num_epochs = 50
+>>> num_train_steps = len(train_ds) * num_epochs
+>>> learning_rate = 6e-5
+>>> weight_decay_rate = 0.01
+
+>>> optimizer, lr_schedule = create_optimizer(
+...     init_lr=learning_rate,
+...     num_train_steps=num_train_steps,
+...     weight_decay_rate=weight_decay_rate,
+...     num_warmup_steps=0,
+... )
+```
+
+次に、ラベル マッピングとともに [`TFAutoModelForSemanticSegmentation`] を使用して SegFormer をロードし、それをコンパイルします。
+オプティマイザ。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> from transformers import TFAutoModelForSemanticSegmentation
+
+>>> model = TFAutoModelForSemanticSegmentation.from_pretrained(
+...     checkpoint,
+...     id2label=id2label,
+...     label2id=label2id,
+... )
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+[`~datasets.Dataset.to_tf_dataset`] と [`DefaultDataCollat​​or`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator(return_tensors="tf")
+
+>>> tf_train_dataset = train_ds.to_tf_dataset(
+...     columns=["pixel_values", "label"],
+...     shuffle=True,
+...     batch_size=batch_size,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_eval_dataset = test_ds.to_tf_dataset(
+...     columns=["pixel_values", "label"],
+...     shuffle=True,
+...     batch_size=batch_size,
+...     collate_fn=data_collator,
+... )
+```
+
+予測から精度を計算し、モデルを 🤗 ハブにプッシュするには、[Keras callbacks](../main_classes/keras_callbacks) を使用します。
+`compute_metrics` 関数を [`KerasMetricCallback`] に渡します。
+そして [`PushToHubCallback`] を使用してモデルをアップロードします。
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback
+
+>>> metric_callback = KerasMetricCallback(
+...     metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"]
+... )
+
+>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor)
+
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルをトレーニングする準備が整いました。`fit()`トレーニングおよび検証データセット、エポック数、
+モデルを微調整するためのコールバック:
+
+```py
+>>> model.fit(
+...     tf_train_dataset,
+...     validation_data=tf_eval_dataset,
+...     callbacks=callbacks,
+...     epochs=num_epochs,
+... )
+```
+
+おめでとう！モデルを微調整し、🤗 Hub で共有しました。これで推論に使用できるようになりました。
+
+</tf>
+</frameworkcontent>
+
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論のために画像をロードします。
+
+```py
+>>> image = ds[0]["image"]
+>>> image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-image.png" alt="Image of bedroom"/>
+</div>
+
+<frameworkcontent>
+<pt>
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して画像セグメンテーション用の `pipeline` をインスタンス化し、それに画像を渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> segmenter = pipeline("image-segmentation", model="my_awesome_seg_model")
+>>> segmenter(image)
+[{'score': None,
+  'label': 'wall',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062690>},
+ {'score': None,
+  'label': 'sky',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062A50>},
+ {'score': None,
+  'label': 'floor',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062B50>},
+ {'score': None,
+  'label': 'ceiling',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062A10>},
+ {'score': None,
+  'label': 'bed ',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062E90>},
+ {'score': None,
+  'label': 'windowpane',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062390>},
+ {'score': None,
+  'label': 'cabinet',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062550>},
+ {'score': None,
+  'label': 'chair',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062D90>},
+ {'score': None,
+  'label': 'armchair',
+  'mask': <PIL.Image.Image image mode=L size=640x427 at 0x7FD5B2062E10>}]
+```
+
+必要に応じて、`pipeline` の結果を手動で複製することもできます。画像プロセッサで画像を処理し、`pixel_values`を GPU に配置します。
+
+```py
+>>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # use GPU if available, otherwise use a CPU
+>>> encoding = image_processor(image, return_tensors="pt")
+>>> pixel_values = encoding.pixel_values.to(device)
+```
+
+入力をモデルに渡し、「logits」を返します。
+
+```py
+>>> outputs = model(pixel_values=pixel_values)
+>>> logits = outputs.logits.cpu()
+```
+
+次に、ロジットを元の画像サイズに再スケールします。
+
+
+```py
+>>> upsampled_logits = nn.functional.interpolate(
+...     logits,
+...     size=image.size[::-1],
+...     mode="bilinear",
+...     align_corners=False,
+... )
+
+>>> pred_seg = upsampled_logits.argmax(dim=1)[0]
+```
+```
+
+</pt>
+</frameworkcontent>
+
+<frameworkcontent>
+<tf>
+
+画像プロセッサをロードして画像を前処理し、入力を TensorFlow テンソルとして返します。
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("MariaK/scene_segmentation")
+>>> inputs = image_processor(image, return_tensors="tf")
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+```py
+>>> from transformers import TFAutoModelForSemanticSegmentation
+
+>>> model = TFAutoModelForSemanticSegmentation.from_pretrained("MariaK/scene_segmentation")
+>>> logits = model(**inputs).logits
+```
+
+次に、ロジットを元の画像サイズに再スケールし、クラス次元に argmax を適用します。
+
+```py
+>>> logits = tf.transpose(logits, [0, 2, 3, 1])
+
+>>> upsampled_logits = tf.image.resize(
+...     logits,
+...     # We reverse the shape of `image` because `image.size` returns width and height.
+...     image.size[::-1],
+... )
+
+>>> pred_seg = tf.math.argmax(upsampled_logits, axis=-1)[0]
+```
+
+</tf>
+</frameworkcontent>
+
+結果を視覚化するには、[データセット カラー パレット](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) を、それぞれをマップする `ade_palette()` としてロードします。クラスを RGB 値に変換します。次に、画像と予測されたセグメンテーション マップを組み合わせてプロットできます。
+
+```py
+>>> import matplotlib.pyplot as plt
+>>> import numpy as np
+
+>>> color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8)
+>>> palette = np.array(ade_palette())
+>>> for label, color in enumerate(palette):
+...     color_seg[pred_seg == label, :] = color
+>>> color_seg = color_seg[..., ::-1]  # convert to BGR
+
+>>> img = np.array(image) * 0.5 + color_seg * 0.5  # plot the image with the segmentation map
+>>> img = img.astype(np.uint8)
+
+>>> plt.figure(figsize=(15, 10))
+>>> plt.imshow(img)
+>>> plt.show()
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/semantic-seg-preds.png" alt="Image of bedroom overlaid with segmentation map"/>
+</div>
diff --git a/docs/source/ja/tasks/summarization.md b/docs/source/ja/tasks/summarization.md
new file mode 100644
index 00000000000000..47b04888d4865f
--- /dev/null
+++ b/docs/source/ja/tasks/summarization.md
@@ -0,0 +1,409 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Summarization
+
+[[open-in-colab]]
+
+<Youtube id="yHnr5Dk2zCI"/>
+
+要約により、すべての重要な情報をまとめた短いバージョンの文書または記事が作成されます。これは、翻訳と並んで、シーケンス間のタスクとして定式化できるタスクのもう 1 つの例です。要約は次のようになります。
+
+- 抽出: 文書から最も関連性の高い情報を抽出します。
+- 抽象的: 最も関連性の高い情報を捉えた新しいテキストを生成します。
+
+このガイドでは、次の方法を説明します。
+
+1. 抽象的な要約のために、[BillSum](https://huggingface.co/datasets/billsum) データセットのカリフォルニア州請求書サブセットで [T5](https://huggingface.co/t5-small) を微調整します。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate rouge_score
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load BillSum dataset
+
+まず、🤗 データセット ライブラリから BillSum データセットの小さいカリフォルニア州請求書サブセットを読み込みます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> billsum = load_dataset("billsum", split="ca_test")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットをトレイン セットとテスト セットに分割します。
+
+```py
+>>> billsum = billsum.train_test_split(test_size=0.2)
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> billsum["train"][0]
+{'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.',
+ 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.',
+ 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'}
+```
+
+使用するフィールドが 2 つあります。
+
+- `text`: モデルへの入力となる請求書のテキスト。
+- `summary`: モデルのターゲットとなる `text` の要約版。
+
+## Preprocess
+
+次のステップでは、T5 トークナイザーをロードして「text」と`summary`を処理します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> checkpoint = "t5-small"
+>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+```
+
+作成する前処理関数は次のことを行う必要があります。
+
+1. T5 がこれが要約タスクであることを認識できるように、入力の前にプロンプ​​トを付けます。複数の NLP タスクが可能な一部のモデルでは、特定のタスクのプロンプトが必要です。
+2. ラベルをトークン化するときにキーワード `text_target` 引数を使用します。
+3. `max_length`パラメータで設定された最大長を超えないようにシーケンスを切り詰めます。
+
+```py
+>>> prefix = "summarize: "
+
+
+>>> def preprocess_function(examples):
+...     inputs = [prefix + doc for doc in examples["text"]]
+...     model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
+
+...     labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
+
+...     model_inputs["labels"] = labels["input_ids"]
+...     return model_inputs
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] メソッドを使用します。 `batched=True` を設定してデータセットの複数の要素を一度に処理することで、`map` 関数を高速化できます。
+
+```py
+>>> tokenized_billsum = billsum.map(preprocess_function, batched=True)
+```
+
+次に、[`DataCollat​​orForSeq2Seq`] を使用してサンプルのバッチを作成します。データセット全体を最大長までパディングするのではなく、照合中にバッチ内の最長の長さまで文を *動的にパディング* する方が効率的です。
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> from transformers import DataCollatorForSeq2Seq
+
+>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
+```
+</pt>
+<tf>
+
+```py
+>>> from transformers import DataCollatorForSeq2Seq
+
+>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) メトリックを読み込みます (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照してください) ) メトリクスをロードして計算する方法の詳細については、次を参照してください)。
+
+```py
+>>> import evaluate
+
+>>> rouge = evaluate.load("rouge")
+```
+
+次に、予測とラベルを [`~evaluate.EvaluationModule.compute`] に渡して ROUGE メトリクスを計算する関数を作成します。
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions, labels = eval_pred
+...     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
+...     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+...     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+...     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
+
+...     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
+...     result["gen_len"] = np.mean(prediction_lens)
+
+...     return {k: round(v, 4) for k, v in result.items()}
+```
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForSeq2SeqLM`] を使用して T5 をロードします。
+
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+```
+
+この時点で残っている手順は次の 3 つだけです。
+
+1. [`Seq2SeqTrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は ROUGE メトリクスを評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数をモデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Seq2SeqTrainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = Seq2SeqTrainingArguments(
+...     output_dir="my_awesome_billsum_model",
+...     evaluation_strategy="epoch",
+...     learning_rate=2e-5,
+...     per_device_train_batch_size=16,
+...     per_device_eval_batch_size=16,
+...     weight_decay=0.01,
+...     save_total_limit=3,
+...     num_train_epochs=4,
+...     predict_with_generate=True,
+...     fp16=True,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Seq2SeqTrainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=tokenized_billsum["train"],
+...     eval_dataset=tokenized_billsum["test"],
+...     tokenizer=tokenizer,
+...     data_collator=data_collator,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-a-tensorflow-model-with-keras) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+```py
+>>> from transformers import create_optimizer, AdamWeightDecay
+
+>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
+```
+
+次に、[`TFAutoModelForSeq2SeqLM`] を使用して T5 をロードできます。
+
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+```
+
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+...     tokenized_billsum["train"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_test_set = model.prepare_tf_dataset(
+...     tokenized_billsum["test"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+トレーニングを開始する前にセットアップする最後の 2 つのことは、予測から ROUGE スコアを計算し、モデルをハブにプッシュする方法を提供することです。どちらも [Keras コールバック](../main_classes/keras_callbacks) を使用して行われます。
+
+`compute_metrics` 関数を [`~transformers.KerasMetricCallback`] に渡します。
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
+```
+
+Specify where to push your model and tokenizer in the [`~transformers.PushToHubCallback`]:
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="my_awesome_billsum_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+次に、コールバックをまとめてバンドルします。
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+要約用にモデルを微調整する方法のより詳細な例については、対応するセクションを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb)。
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+要約したいテキストを考え出します。 T5 の場合、作業中のタスクに応じて入力に接頭辞を付ける必要があります。要約するには、以下に示すように入力にプレフィックスを付ける必要があります。
+
+```py
+>>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して要約用の `pipeline` をインスタンス化し、テキストをそれに渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model")
+>>> summarizer(text)
+[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]
+```
+
+必要に応じて、`pipeline`」の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+Tokenize the text and return the `input_ids` as PyTorch tensors:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> inputs = tokenizer(text, return_tensors="pt").input_ids
+```
+
+[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
+```
+
+生成されたトークン ID をデコードしてテキストに戻します。
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.'
+```
+</pt>
+<tf>
+
+テキストをトークン化し、`input_ids`を TensorFlow テンソルとして返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> inputs = tokenizer(text, return_tensors="tf").input_ids
+```
+
+[`~transformers.generation_tf_utils.TFGenerationMixin.generate`] メソッドを使用して要約を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model")
+>>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
+```
+
+生成されたトークン ID をデコードしてテキストに戻します。
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.'
+```
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/text-to-speech.md b/docs/source/ja/tasks/text-to-speech.md
new file mode 100644
index 00000000000000..357ec18855149e
--- /dev/null
+++ b/docs/source/ja/tasks/text-to-speech.md
@@ -0,0 +1,638 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Text to speech
+
+[[open-in-colab]]
+
+テキスト読み上げ (TTS) は、テキストから自然な音声を作成するタスクです。音声は複数の形式で生成できます。
+言語と複数の話者向け。現在、いくつかのテキスト読み上げモデルが 🤗 Transformers で利用可能です。
+[Bark](../model_doc/bark)、[MMS](../model_doc/mms)、[VITS](../model_doc/vits)、および [SpeechT5](../model_doc/speecht5)。
+
+`text-to-audio`パイプライン (またはその別名 - `text-to-speech`) を使用して、音声を簡単に生成できます。 Bark などの一部のモデルは、
+笑い、ため息、泣きなどの非言語コミュニケーションを生成したり、音楽を追加したりするように条件付けすることもできます。
+Bark で`text-to-speech`パイプラインを使用する方法の例を次に示します。
+
+```py
+>>> from transformers import pipeline
+
+>>> pipe = pipeline("text-to-speech", model="suno/bark-small")
+>>> text = "[clears throat] This is a test ... and I just took a long pause."
+>>> output = pipe(text)
+```
+
+ノートブックで結果の音声を聞くために使用できるコード スニペットを次に示します。
+
+```python
+>>> from IPython.display import Audio
+>>> Audio(output["audio"], rate=output["sampling_rate"])
+```
+Bark およびその他の事前トレーニングされた TTS モデルができることの詳細な例については、次のドキュメントを参照してください。
+[音声コース](https://huggingface.co/learn/audio-course/chapter6/pre-trained_models)。
+
+TTS モデルを微調整する場合、現在微調整できるのは SpeechT5 のみです。 SpeechT5 は、次の組み合わせで事前トレーニングされています。
+音声からテキストへのデータとテキストから音声へのデータ。両方のテキストに共有される隠された表現の統一された空間を学習できるようにします。
+そしてスピーチ。これは、同じ事前トレーニング済みモデルをさまざまなタスクに合わせて微調整できることを意味します。さらに、SpeechT5
+X ベクトル スピーカーの埋め込みを通じて複数のスピーカーをサポートします。
+
+このガイドの残りの部分では、次の方法を説明します。
+
+1. [VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) のオランダ語 (`nl`) 言語サブセット上の英語音声で元々トレーニングされた [SpeechT5](../model_doc/speecht5) を微調整します。 データセット。
+2. パイプラインを使用するか直接使用するかの 2 つの方法のいずれかで、洗練されたモデルを推論に使用します。
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+
+```bash
+pip install datasets soundfile speechbrain accelerate
+```
+
+SpeechT5 のすべての機能がまだ正式リリースにマージされていないため、ソースから 🤗Transformers をインストールします。
+
+```bash
+pip install git+https://github.com/huggingface/transformers.git
+```
+
+<Tip>
+
+このガイドに従うには、GPU が必要です。ノートブックで作業している場合は、次の行を実行して GPU が利用可能かどうかを確認します。
+
+```bash
+!nvidia-smi
+```
+
+</Tip>
+
+Hugging Face アカウントにログインして、モデルをアップロードしてコミュニティと共有することをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load the dataset
+
+[VoxPopuli](https://huggingface.co/datasets/facebook/voxpopuli) は、以下で構成される大規模な多言語音声コーパスです。
+データは 2009 年から 2020 年の欧州議会のイベント記録をソースとしています。 15 件分のラベル付き音声文字起こしデータが含まれています。
+ヨーロッパの言語。このガイドではオランダ語のサブセットを使用していますが、自由に別のサブセットを選択してください。
+
+VoxPopuli またはその他の自動音声認識 (ASR) データセットは最適ではない可能性があることに注意してください。
+TTS モデルをトレーニングするためのオプション。過剰なバックグラウンドノイズなど、ASR にとって有益となる機能は次のとおりです。
+通常、TTS では望ましくありません。ただし、最高品質、多言語、マルチスピーカーの TTS データセットを見つけるのは非常に困難な場合があります。
+挑戦的。
+
+データをロードしましょう:
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("facebook/voxpopuli", "nl", split="train")
+>>> len(dataset)
+20968
+```
+
+微調整には 20968 個の例で十分です。 SpeechT5 はオーディオ データのサンプリング レートが 16 kHz であることを想定しているため、
+データセット内の例がこの要件を満たしていることを確認してください。
+
+
+```py
+dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+```
+
+## Preprocess the data
+
+使用するモデル チェックポイントを定義し、適切なプロセッサをロードすることから始めましょう。
+
+
+```py
+>>> from transformers import SpeechT5Processor
+
+>>> checkpoint = "microsoft/speecht5_tts"
+>>> processor = SpeechT5Processor.from_pretrained(checkpoint)
+```
+
+### Text cleanup for SpeechT5 tokenization 
+
+
+まずはテキストデータをクリーンアップすることから始めます。テキストを処理するには、プロセッサのトークナイザー部分が必要です。
+
+```py
+>>> tokenizer = processor.tokenizer
+```
+
+データセットの例には、`raw_text`機能と `normalized_text`機能が含まれています。テキスト入力としてどの機能を使用するかを決めるときは、
+SpeechT5 トークナイザーには数値のトークンがないことを考慮してください。 `normalized_text`には数字が書かれています
+テキストとして出力します。したがって、これはより適切であり、入力テキストとして `normalized_text` を使用することをお勧めします。
+
+SpeechT5 は英語でトレーニングされているため、オランダ語のデータセット内の特定の文字を認識しない可能性があります。もし
+残っているように、これらの文字は `<unk>`トークンに変換されます。ただし、オランダ語では、`à`などの特定の文字は
+音節を強調することに慣れています。テキストの意味を保持するために、この文字を通常の`a`に置き換えることができます。
+
+サポートされていないトークンを識別するには、`SpeechT5Tokenizer`を使用してデータセット内のすべての一意の文字を抽出します。
+文字をトークンとして扱います。これを行うには、以下を連結する `extract_all_chars` マッピング関数を作成します。
+すべての例からの転写を 1 つの文字列にまとめ、それを文字セットに変換します。
+すべての文字起こしが一度に利用できるように、`dataset.map()`で`b​​atched=True`と`batch_size=-1`を必ず設定してください。
+マッピング機能。
+
+```py
+>>> def extract_all_chars(batch):
+...     all_text = " ".join(batch["normalized_text"])
+...     vocab = list(set(all_text))
+...     return {"vocab": [vocab], "all_text": [all_text]}
+
+
+>>> vocabs = dataset.map(
+...     extract_all_chars,
+...     batched=True,
+...     batch_size=-1,
+...     keep_in_memory=True,
+...     remove_columns=dataset.column_names,
+... )
+
+>>> dataset_vocab = set(vocabs["vocab"][0])
+>>> tokenizer_vocab = {k for k, _ in tokenizer.get_vocab().items()}
+```
+
+これで、2 つの文字セットができました。1 つはデータセットの語彙を持ち、もう 1 つはトークナイザーの語彙を持ちます。
+データセット内でサポートされていない文字を特定するには、これら 2 つのセットの差分を取ることができます。結果として
+set には、データセットにはあるがトークナイザーには含まれていない文字が含まれます。
+
+```py
+>>> dataset_vocab - tokenizer_vocab
+{' ', 'à', 'ç', 'è', 'ë', 'í', 'ï', 'ö', 'ü'}
+```
+
+前の手順で特定されたサポートされていない文字を処理するには、これらの文字を
+有効なトークン。スペースはトークナイザーですでに `▁` に置き換えられているため、個別に処理する必要がないことに注意してください。
+
+```py
+>>> replacements = [
+...     ("à", "a"),
+...     ("ç", "c"),
+...     ("è", "e"),
+...     ("ë", "e"),
+...     ("í", "i"),
+...     ("ï", "i"),
+...     ("ö", "o"),
+...     ("ü", "u"),
+... ]
+
+
+>>> def cleanup_text(inputs):
+...     for src, dst in replacements:
+...         inputs["normalized_text"] = inputs["normalized_text"].replace(src, dst)
+...     return inputs
+
+
+>>> dataset = dataset.map(cleanup_text)
+```
+
+テキスト内の特殊文字を扱ったので、今度は音声データに焦点を移します。
+
+### Speakers
+
+VoxPopuli データセットには複数の話者の音声が含まれていますが、データセットには何人の話者が含まれているのでしょうか?に
+これを決定すると、一意の話者の数と、各話者がデータセットに寄与する例の数を数えることができます。
+データセットには合計 20,968 個の例が含まれており、この情報により、分布をより深く理解できるようになります。
+講演者とデータ内の例。
+
+```py
+>>> from collections import defaultdict
+
+>>> speaker_counts = defaultdict(int)
+
+>>> for speaker_id in dataset["speaker_id"]:
+...     speaker_counts[speaker_id] += 1
+```
+
+ヒストグラムをプロットすると、各話者にどれだけのデータがあるかを把握できます。
+
+```py
+>>> import matplotlib.pyplot as plt
+
+>>> plt.figure()
+>>> plt.hist(speaker_counts.values(), bins=20)
+>>> plt.ylabel("Speakers")
+>>> plt.xlabel("Examples")
+>>> plt.show()
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_speakers_histogram.png" alt="Speakers histogram"/>
+</div>
+
+ヒストグラムから、データセット内の話者の約 3 分の 1 の例が 100 未満であることがわかります。
+約 10 人の講演者が 500 以上の例を持っています。トレーニング効率を向上させ、データセットのバランスをとるために、次のことを制限できます。
+100 ～ 400 個の例を含むデータを講演者に提供します。
+
+```py
+>>> def select_speaker(speaker_id):
+...     return 100 <= speaker_counts[speaker_id] <= 400
+
+
+>>> dataset = dataset.filter(select_speaker, input_columns=["speaker_id"])
+```
+
+残りのスピーカーの数を確認してみましょう。
+
+```py
+>>> len(set(dataset["speaker_id"]))
+42
+```
+
+残りの例がいくつあるか見てみましょう。
+
+
+```py
+>>> len(dataset)
+9973
+```
+
+約 40 人のユニークな講演者からの 10,000 弱の例が残りますが、これで十分です。
+
+例が少ないスピーカーの中には、例が長い場合、実際にはより多くの音声が利用できる場合があることに注意してください。しかし、
+各話者の音声の合計量を決定するには、データセット全体をスキャンする必要があります。
+各オーディオ ファイルのロードとデコードを伴う時間のかかるプロセス。そのため、ここではこのステップをスキップすることにしました。
+
+### Speaker embeddings
+
+TTS モデルが複数のスピーカーを区別できるようにするには、サンプルごとにスピーカーの埋め込みを作成する必要があります。
+スピーカーの埋め込みは、特定のスピーカーの音声特性をキャプチャするモデルへの追加入力です。
+これらのスピーカー埋め込みを生成するには、事前トレーニングされた [spkrec-xvect-voxceleb](https://huggingface.co/speechbrain/spkrec-xvect-voxceleb) を使用します。
+SpeechBrain のモデル。
+
+入力オーディオ波形を受け取り、512 要素のベクトルを出力する関数 `create_speaker_embedding()` を作成します。
+対応するスピーカー埋め込みが含まれます。
+
+
+```py
+>>> import os
+>>> import torch
+>>> from speechbrain.pretrained import EncoderClassifier
+
+>>> spk_model_name = "speechbrain/spkrec-xvect-voxceleb"
+
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> speaker_model = EncoderClassifier.from_hparams(
+...     source=spk_model_name,
+...     run_opts={"device": device},
+...     savedir=os.path.join("/tmp", spk_model_name),
+... )
+
+
+>>> def create_speaker_embedding(waveform):
+...     with torch.no_grad():
+...         speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform))
+...         speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
+...         speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy()
+...     return speaker_embeddings
+```
+
+`speechbrain/spkrec-xvect-voxceleb`モデルは、VoxCeleb からの英語音声でトレーニングされたことに注意することが重要です。
+データセットですが、このガイドのトレーニング例はオランダ語です。このモデルは今後も生成されると信じていますが、
+オランダ語のデータセットに適切な話者埋め込みを行っても、この仮定はすべての場合に当てはまらない可能性があります。
+
+最適な結果を得るには、最初にターゲット音声で X ベクトル モデルをトレーニングすることをお勧めします。これにより、モデルが確実に
+オランダ語に存在する独特の音声特徴をよりよく捉えることができます。
+
+### Processing the dataset
+
+最後に、モデルが期待する形式にデータを処理しましょう。を取り込む `prepare_dataset` 関数を作成します。
+これは 1 つの例であり、`SpeechT5Processor` オブジェクトを使用して入力テキストをトークン化し、ターゲット オーディオをログメル スペクトログラムにロードします。
+また、追加の入力としてスピーカーの埋め込みも追加する必要があります。
+
+```py
+>>> def prepare_dataset(example):
+...     audio = example["audio"]
+
+...     example = processor(
+...         text=example["normalized_text"],
+...         audio_target=audio["array"],
+...         sampling_rate=audio["sampling_rate"],
+...         return_attention_mask=False,
+...     )
+
+...     # strip off the batch dimension
+...     example["labels"] = example["labels"][0]
+
+...     # use SpeechBrain to obtain x-vector
+...     example["speaker_embeddings"] = create_speaker_embedding(audio["array"])
+
+...     return example
+```
+
+単一の例を見て、処理が正しいことを確認します。
+
+```py
+>>> processed_example = prepare_dataset(dataset[0])
+>>> list(processed_example.keys())
+['input_ids', 'labels', 'stop_labels', 'speaker_embeddings']
+```
+スピーカーのエンベディングは 512 要素のベクトルである必要があります。
+
+```py
+>>> processed_example["speaker_embeddings"].shape
+(512,)
+```
+
+ラベルは、80 メル ビンを含むログメル スペクトログラムである必要があります。
+
+```py
+>>> import matplotlib.pyplot as plt
+
+>>> plt.figure()
+>>> plt.imshow(processed_example["labels"].T)
+>>> plt.show()
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_logmelspectrogram_1.png" alt="Log-mel spectrogram with 80 mel bins"/>
+</div>
+
+補足: このスペクトログラムがわかりにくいと感じる場合は、低周波を配置する規則に慣れていることが原因である可能性があります。
+プロットの下部に高周波、上部に高周波が表示されます。ただし、matplotlib ライブラリを使用してスペクトログラムを画像としてプロットする場合、
+Y 軸が反転され、スペクトログラムが上下逆に表示されます。
+
+次に、処理関数をデータセット全体に適用します。これには 5 ～ 10 分かかります。
+
+```py
+>>> dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names)
+```
+
+データセット内の一部の例が、モデルが処理できる最大入力長 (600 トークン) を超えていることを示す警告が表示されます。
+それらの例をデータセットから削除します。ここではさらに進んで、より大きなバッチ サイズを可能にするために、200 トークンを超えるものはすべて削除します。
+
+```py
+>>> def is_not_too_long(input_ids):
+...     input_length = len(input_ids)
+...     return input_length < 200
+
+
+>>> dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"])
+>>> len(dataset)
+8259
+```
+
+次に、基本的なトレーニング/テスト分割を作成します。
+
+```py
+>>> dataset = dataset.train_test_split(test_size=0.1)
+```
+
+### Data collator
+
+複数の例を 1 つのバッチに結合するには、カスタム データ照合器を定義する必要があります。このコレーターは、短いシーケンスをパディングで埋め込みます。
+トークンを使用して、すべての例が同じ長さになるようにします。スペクトログラム ラベルの場合、埋め込まれた部分は特別な値 `-100` に置き換えられます。この特別な価値は
+スペクトログラム損失を計算するときに、スペクトログラムのその部分を無視するようにモデルに指示します。
+
+```py
+>>> from dataclasses import dataclass
+>>> from typing import Any, Dict, List, Union
+
+
+>>> @dataclass
+... class TTSDataCollatorWithPadding:
+...     processor: Any
+
+...     def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
+...         input_ids = [{"input_ids": feature["input_ids"]} for feature in features]
+...         label_features = [{"input_values": feature["labels"]} for feature in features]
+...         speaker_features = [feature["speaker_embeddings"] for feature in features]
+
+...         # collate the inputs and targets into a batch
+...         batch = processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt")
+
+...         # replace padding with -100 to ignore loss correctly
+...         batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100)
+
+...         # not used during fine-tuning
+...         del batch["decoder_attention_mask"]
+
+...         # round down target lengths to multiple of reduction factor
+...         if model.config.reduction_factor > 1:
+...             target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features])
+...             target_lengths = target_lengths.new(
+...                 [length - length % model.config.reduction_factor for length in target_lengths]
+...             )
+...             max_length = max(target_lengths)
+...             batch["labels"] = batch["labels"][:, :max_length]
+
+...         # also add in the speaker embeddings
+...         batch["speaker_embeddings"] = torch.tensor(speaker_features)
+
+...         return batch
+```
+
+SpeechT5 では、モデルのデコーダ部分への入力が 2 分の 1 に削減されます。つまり、すべてのデータが破棄されます。
+ターゲット シーケンスからの他のタイムステップ。次に、デコーダは 2 倍の長さのシーケンスを予測します。オリジナル以来
+ターゲット シーケンスの長さが奇数である可能性がある場合、データ照合機能はバッチの最大長を切り捨てて、
+2の倍数。
+
+```py 
+>>> data_collator = TTSDataCollatorWithPadding(processor=processor)
+```
+
+## Train the model
+
+プロセッサのロードに使用したのと同じチェックポイントから事前トレーニングされたモデルをロードします。
+
+```py
+>>> from transformers import SpeechT5ForTextToSpeech
+
+>>> model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
+```
+
+`use_cache=True`オプションは、勾配チェックポイントと互換性がありません。トレーニングのために無効にします。
+
+```py 
+>>> model.config.use_cache = False
+```
+
+トレーニング引数を定義します。ここでは、トレーニング プロセス中に評価メトリクスを計算していません。代わりに、
+損失だけを見てください。
+
+```python
+>>> from transformers import Seq2SeqTrainingArguments
+
+>>> training_args = Seq2SeqTrainingArguments(
+...     output_dir="speecht5_finetuned_voxpopuli_nl",  # change to a repo name of your choice
+...     per_device_train_batch_size=4,
+...     gradient_accumulation_steps=8,
+...     learning_rate=1e-5,
+...     warmup_steps=500,
+...     max_steps=4000,
+...     gradient_checkpointing=True,
+...     fp16=True,
+...     evaluation_strategy="steps",
+...     per_device_eval_batch_size=2,
+...     save_steps=1000,
+...     eval_steps=1000,
+...     logging_steps=25,
+...     report_to=["tensorboard"],
+...     load_best_model_at_end=True,
+...     greater_is_better=False,
+...     label_names=["labels"],
+...     push_to_hub=True,
+... )
+```
+
+`Trainer`オブジェクトをインスタンス化し、モデル、データセット、データ照合器をそれに渡します。
+
+```py
+>>> from transformers import Seq2SeqTrainer
+
+>>> trainer = Seq2SeqTrainer(
+...     args=training_args,
+...     model=model,
+...     train_dataset=dataset["train"],
+...     eval_dataset=dataset["test"],
+...     data_collator=data_collator,
+...     tokenizer=processor,
+... )
+```
+これで、トレーニングを開始する準備が整いました。トレーニングには数時間かかります。 GPU に応じて、
+トレーニングを開始するときに、CUDA の「メモリ不足」エラーが発生する可能性があります。この場合、減らすことができます
+`per_device_train_batch_size`を 2 倍に増分し、`gradient_accumulation_steps`を 2 倍に増やして補正します。
+
+```py
+>>> trainer.train()
+```
+パイプラインでチェックポイントを使用できるようにするには、必ずプロセッサをチェックポイントとともに保存してください。
+
+```py
+>>> processor.save_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl")
+```
+
+最終モデルを 🤗 ハブにプッシュします。
+
+```py
+>>> trainer.push_to_hub()
+```
+
+## Inference
+
+### Inference with a pipeline
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+まず、対応するパイプラインでそれを使用する方法を見てみましょう。 `"text-to-speech"` パイプラインを作成しましょう
+チェックポイント:
+
+```py
+>>> from transformers import pipeline
+
+>>> pipe = pipeline("text-to-speech", model="YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl")
+```
+
+ナレーションを希望するオランダ語のテキストを選択してください。例:
+
+```py
+>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!"
+```
+
+パイプラインで SpeechT5 を使用するには、スピーカーの埋め込みが必要です。テスト データセットの例から取得してみましょう。
+
+```py
+>>> example = dataset["test"][304]
+>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
+```
+
+これで、テキストとスピーカーの埋め込みをパイプラインに渡すことができ、残りはパイプラインが処理します。
+
+
+```py
+>>> forward_params = {"speaker_embeddings": speaker_embeddings}
+>>> output = pipe(text, forward_params=forward_params)
+>>> output
+{'audio': array([-6.82714235e-05, -4.26525949e-04,  1.06134125e-04, ...,
+        -1.22392643e-03, -7.76011671e-04,  3.29112721e-04], dtype=float32),
+ 'sampling_rate': 16000}
+```
+
+その後、結果を聞くことができます。
+
+
+```py
+>>> from IPython.display import Audio
+>>> Audio(output['audio'], rate=output['sampling_rate']) 
+```
+
+### Run inference manually
+
+パイプラインを使用しなくても同じ推論結果を得ることができますが、より多くの手順が必要になります。
+
+🤗 ハブからモデルをロードします。
+
+
+```py
+>>> model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl")
+```
+
+テスト データセットから例を選択して、スピーカーの埋め込みを取得します。
+
+```py 
+>>> example = dataset["test"][304]
+>>> speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0)
+```
+
+入力テキストを定義し、トークン化します。
+
+```py 
+>>> text = "hallo allemaal, ik praat nederlands. groetjes aan iedereen!"
+>>> inputs = processor(text=text, return_tensors="pt")
+```
+
+モデルを使用してスペクトログラムを作成します。
+
+```py
+>>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
+```
+
+次のことを行う場合は、スペクトログラムを視覚化します。
+
+```py
+>>> plt.figure()
+>>> plt.imshow(spectrogram.T)
+>>> plt.show()
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/tts_logmelspectrogram_2.png" alt="Generated log-mel spectrogram"/>
+</div>
+
+最後に、ボコーダーを使用してスペクトログラムをサウンドに変換します。
+
+```py
+>>> with torch.no_grad():
+...     speech = vocoder(spectrogram)
+
+>>> from IPython.display import Audio
+
+>>> Audio(speech.numpy(), rate=16000)
+```
+
+私たちの経験では、このモデルから満足のいく結果を得るのは難しい場合があります。スピーカーの品質
+埋め込みは重要な要素であるようです。 SpeechT5 は英語の x ベクトルで事前トレーニングされているため、最高のパフォーマンスを発揮します
+英語スピーカーの埋め込みを使用する場合。合成音声の音質が悪い場合は、別のスピーカー埋め込みを使用してみてください。
+
+トレーニング期間を長くすると、結果の質も向上する可能性があります。それでも、そのスピーチは明らかに英語ではなくオランダ語です。
+話者の音声特性をキャプチャします (例の元の音声と比較)。
+もう 1 つ実験すべきことは、モデルの構成です。たとえば、`config.reduction_factor = 1`を使用してみてください。
+これにより結果が改善されるかどうかを確認してください。
+
+最後に、倫理的配慮を考慮することが不可欠です。 TTS テクノロジーには数多くの有用な用途がありますが、
+また、知らないうちに誰かの声を偽装するなど、悪意のある目的に使用される可能性もあります。お願いします
+TTS は賢明かつ責任を持って使用してください。
diff --git a/docs/source/ja/tasks/token_classification.md b/docs/source/ja/tasks/token_classification.md
new file mode 100644
index 00000000000000..a4b759d6b5b3b7
--- /dev/null
+++ b/docs/source/ja/tasks/token_classification.md
@@ -0,0 +1,565 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Token classification
+
+[[open-in-colab]]
+
+<Youtube id="wVHdVlPScxA"/>
+
+トークン分類では、文内の個々のトークンにラベルを割り当てます。最も一般的なトークン分類タスクの 1 つは、固有表現認識 (NER) です。 NER は、人、場所、組織など、文内の各エンティティのラベルを見つけようとします。
+
+このガイドでは、次の方法を説明します。
+
+1. [WNUT 17](https://huggingface.co/datasets/wnut_17) データセットで [DistilBERT](https://huggingface.co/distilbert-base-uncased) を微調整して、新しいエンティティを検出します。
+2. 微調整されたモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[ALBERT](../model_doc/albert), [BERT](../model_doc/bert), [BigBird](../model_doc/big_bird), [BioGpt](../model_doc/biogpt), [BLOOM](../model_doc/bloom), [BROS](../model_doc/bros), [CamemBERT](../model_doc/camembert), [CANINE](../model_doc/canine), [ConvBERT](../model_doc/convbert), [Data2VecText](../model_doc/data2vec-text), [DeBERTa](../model_doc/deberta), [DeBERTa-v2](../model_doc/deberta-v2), [DistilBERT](../model_doc/distilbert), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [ErnieM](../model_doc/ernie_m), [ESM](../model_doc/esm), [Falcon](../model_doc/falcon), [FlauBERT](../model_doc/flaubert), [FNet](../model_doc/fnet), [Funnel Transformer](../model_doc/funnel), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [I-BERT](../model_doc/ibert), [LayoutLM](../model_doc/layoutlm), [LayoutLMv2](../model_doc/layoutlmv2), [LayoutLMv3](../model_doc/layoutlmv3), [LiLT](../model_doc/lilt), [Longformer](../model_doc/longformer), [LUKE](../model_doc/luke), [MarkupLM](../model_doc/markuplm), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [MobileBERT](../model_doc/mobilebert), [MPNet](../model_doc/mpnet), [MPT](../model_doc/mpt), [MRA](../model_doc/mra), [Nezha](../model_doc/nezha), [Nyströmformer](../model_doc/nystromformer), [QDQBert](../model_doc/qdqbert), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [SqueezeBERT](../model_doc/squeezebert), [XLM](../model_doc/xlm), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod), [YOSO](../model_doc/yoso)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate seqeval
+```
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load WNUT 17 dataset
+
+まず、🤗 データセット ライブラリから WNUT 17 データセットをロードします。
+
+```py
+>>> from datasets import load_dataset
+
+>>> wnut = load_dataset("wnut_17")
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> wnut["train"][0]
+{'id': '0',
+ 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
+ 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
+}
+```
+
+`ner_tags`内の各数字はエンティティを表します。数値をラベル名に変換して、エンティティが何であるかを調べます。
+
+```py
+>>> label_list = wnut["train"].features[f"ner_tags"].feature.names
+>>> label_list
+[
+    "O",
+    "B-corporation",
+    "I-corporation",
+    "B-creative-work",
+    "I-creative-work",
+    "B-group",
+    "I-group",
+    "B-location",
+    "I-location",
+    "B-person",
+    "I-person",
+    "B-product",
+    "I-product",
+]
+```
+各 `ner_tag` の前に付く文字は、エンティティのトークンの位置を示します。
+
+- `B-` はエンティティの始まりを示します。
+- `I-` は、トークンが同じエンティティ内に含まれていることを示します (たとえば、`State` トークンは次のようなエンティティの一部です)
+  `Empire State Building`）。
+- `0` は、トークンがどのエンティティにも対応しないことを示します。
+
+## Preprocess
+
+<Youtube id="iY2AZYdZAr0"/>
+
+次のステップでは、DistilBERT トークナイザーをロードして`tokens`フィールドを前処理します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+```
+
+上の `tokens`フィールドの例で見たように、入力はすでにトークン化されているようです。しかし、実際には入力はまだトークン化されていないため、単語をサブワードにトークン化するには`is_split_into_words=True` を設定する必要があります。例えば：
+
+```py
+>>> example = wnut["train"][0]
+>>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
+>>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
+>>> tokens
+['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']
+```
+
+ただし、これによりいくつかの特別なトークン `[CLS]` と `[SEP]` が追加され、サブワードのトークン化により入力とラベルの間に不一致が生じます。 1 つのラベルに対応する 1 つの単語を 2 つのサブワードに分割できるようになりました。次の方法でトークンとラベルを再調整する必要があります。
+
+1. [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids) メソッドを使用して、すべてのトークンを対応する単語にマッピングします。
+2. 特別なトークン `[CLS]` と `[SEP]` にラベル `-100` を割り当て、それらが PyTorch 損失関数によって無視されるようにします ([CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html))。
+3. 特定の単語の最初のトークンのみにラベルを付けます。同じ単語の他のサブトークンに `-100`を割り当てます。
+
+トークンとラベルを再調整し、シーケンスを DistilBERT の最大入力長以下に切り詰める関数を作成する方法を次に示します。
+
+```py
+>>> def tokenize_and_align_labels(examples):
+...     tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
+
+...     labels = []
+...     for i, label in enumerate(examples[f"ner_tags"]):
+...         word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
+...         previous_word_idx = None
+...         label_ids = []
+...         for word_idx in word_ids:  # Set the special tokens to -100.
+...             if word_idx is None:
+...                 label_ids.append(-100)
+...             elif word_idx != previous_word_idx:  # Only label the first token of a given word.
+...                 label_ids.append(label[word_idx])
+...             else:
+...                 label_ids.append(-100)
+...             previous_word_idx = word_idx
+...         labels.append(label_ids)
+
+...     tokenized_inputs["labels"] = labels
+...     return tokenized_inputs
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] 関数を使用します。 `batched=True` を設定してデータセットの複数の要素を一度に処理することで、`map` 関数を高速化できます。
+
+```py
+>>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
+```
+
+次に、[`DataCollat​​orWithPadding`] を使用してサンプルのバッチを作成します。データセット全体を最大長までパディングするのではなく、照合中にバッチ内の最長の長さまで文を *動的にパディング* する方が効率的です。
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> from transformers import DataCollatorForTokenClassification
+
+>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
+```
+</pt>
+<tf>
+
+```py
+>>> from transformers import DataCollatorForTokenClassification
+
+>>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) フレームワークを読み込みます (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照してください) ) メトリクスの読み込みと計算の方法について詳しくは、こちらをご覧ください)。 Seqeval は実際に、精度、再現率、F1、精度などのいくつかのスコアを生成します。
+
+```py
+>>> import evaluate
+
+>>> seqeval = evaluate.load("seqeval")
+```
+
+まず NER ラベルを取得してから、真の予測と真のラベルを [`~evaluate.EvaluationModule.compute`] に渡してスコアを計算する関数を作成します。
+
+```py
+>>> import numpy as np
+
+>>> labels = [label_list[i] for i in example[f"ner_tags"]]
+
+
+>>> def compute_metrics(p):
+...     predictions, labels = p
+...     predictions = np.argmax(predictions, axis=2)
+
+...     true_predictions = [
+...         [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
+...         for prediction, label in zip(predictions, labels)
+...     ]
+...     true_labels = [
+...         [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
+...         for prediction, label in zip(predictions, labels)
+...     ]
+
+...     results = seqeval.compute(predictions=true_predictions, references=true_labels)
+...     return {
+...         "precision": results["overall_precision"],
+...         "recall": results["overall_recall"],
+...         "f1": results["overall_f1"],
+...         "accuracy": results["overall_accuracy"],
+...     }
+```
+
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+
+モデルのトレーニングを開始する前に、`id2label`と`label2id`を使用して、予想される ID とそのラベルのマップを作成します。
+```py
+>>> id2label = {
+...     0: "O",
+...     1: "B-corporation",
+...     2: "I-corporation",
+...     3: "B-creative-work",
+...     4: "I-creative-work",
+...     5: "B-group",
+...     6: "I-group",
+...     7: "B-location",
+...     8: "I-location",
+...     9: "B-person",
+...     10: "I-person",
+...     11: "B-product",
+...     12: "I-product",
+... }
+>>> label2id = {
+...     "O": 0,
+...     "B-corporation": 1,
+...     "I-corporation": 2,
+...     "B-creative-work": 3,
+...     "I-creative-work": 4,
+...     "B-group": 5,
+...     "I-group": 6,
+...     "B-location": 7,
+...     "I-location": 8,
+...     "B-person": 9,
+...     "I-person": 10,
+...     "B-product": 11,
+...     "I-product": 12,
+... }
+```
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForTokenClassification`] を使用して、予期されるラベルの数とラベル マッピングを指定して DistilBERT を読み込みます。
+
+```py
+>>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
+
+>>> model = AutoModelForTokenClassification.from_pretrained(
+...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+... )
+```
+
+この時点で残っているステップは 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は連続スコアを評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数を、モデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Trainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_wnut_model",
+...     learning_rate=2e-5,
+...     per_device_train_batch_size=16,
+...     per_device_eval_batch_size=16,
+...     num_train_epochs=2,
+...     weight_decay=0.01,
+...     evaluation_strategy="epoch",
+...     save_strategy="epoch",
+...     load_best_model_at_end=True,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=tokenized_wnut["train"],
+...     eval_dataset=tokenized_wnut["test"],
+...     tokenizer=tokenizer,
+...     data_collator=data_collator,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-a-tensorflow-model-with-keras) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+```py
+>>> from transformers import create_optimizer
+
+>>> batch_size = 16
+>>> num_train_epochs = 3
+>>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs
+>>> optimizer, lr_schedule = create_optimizer(
+...     init_lr=2e-5,
+...     num_train_steps=num_train_steps,
+...     weight_decay_rate=0.01,
+...     num_warmup_steps=0,
+... )
+```
+次に、[`TFAutoModelForTokenClassification`] を使用して、予期されるラベルの数とラベル マッピングを指定して DistilBERT をロードできます。
+
+```py
+>>> from transformers import TFAutoModelForTokenClassification
+
+>>> model = TFAutoModelForTokenClassification.from_pretrained(
+...     "distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
+... )
+```
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+...     tokenized_wnut["train"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_validation_set = model.prepare_tf_dataset(
+...     tokenized_wnut["validation"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+
+トレーニングを開始する前にセットアップする最後の 2 つのことは、予測から連続スコアを計算することと、モデルをハブにプッシュする方法を提供することです。どちらも [Keras コールバック](../main_classes/keras_callbacks) を使用して行われます。
+
+`compute_metrics` 関数を [`~transformers.KerasMetricCallback`] に渡します。
+
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
+```
+
+[`~transformers.PushToHubCallback`] でモデルとトークナイザーをプッシュする場所を指定します。
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="my_awesome_wnut_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+次に、コールバックをまとめてバンドルします。
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+トークン分類のモデルを微調整する方法のより詳細な例については、対応するセクションを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)。
+
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論を実行したいテキストをいくつか取得します。
+
+```py
+>>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して NER の`pipeline`をインスタンス化し、テキストをそれに渡します。
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model")
+>>> classifier(text)
+[{'entity': 'B-location',
+  'score': 0.42658573,
+  'index': 2,
+  'word': 'golden',
+  'start': 4,
+  'end': 10},
+ {'entity': 'I-location',
+  'score': 0.35856336,
+  'index': 3,
+  'word': 'state',
+  'start': 11,
+  'end': 16},
+ {'entity': 'B-group',
+  'score': 0.3064001,
+  'index': 4,
+  'word': 'warriors',
+  'start': 17,
+  'end': 25},
+ {'entity': 'B-location',
+  'score': 0.65523505,
+  'index': 13,
+  'word': 'san',
+  'start': 80,
+  'end': 83},
+ {'entity': 'B-location',
+  'score': 0.4668663,
+  'index': 14,
+  'word': 'francisco',
+  'start': 84,
+  'end': 93}]
+```
+
+必要に応じて、`pipeline`の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+テキストをトークン化して PyTorch テンソルを返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model")
+>>> inputs = tokenizer(text, return_tensors="pt")
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+```py
+>>> from transformers import AutoModelForTokenClassification
+
+>>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model")
+>>> with torch.no_grad():
+...     logits = model(**inputs).logits
+```
+
+最も高い確率でクラスを取得し、モデルの `id2label` マッピングを使用してそれをテキスト ラベルに変換します。
+
+```py
+>>> predictions = torch.argmax(logits, dim=2)
+>>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
+>>> predicted_token_class
+['O',
+ 'O',
+ 'B-location',
+ 'I-location',
+ 'B-group',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'B-location',
+ 'B-location',
+ 'O',
+ 'O']
+```
+
+</pt>
+<tf>
+
+テキストをトークン化し、TensorFlow テンソルを返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model")
+>>> inputs = tokenizer(text, return_tensors="tf")
+```
+
+入力をモデルに渡し、`logits`を返します。
+
+
+```py
+>>> from transformers import TFAutoModelForTokenClassification
+
+>>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model")
+>>> logits = model(**inputs).logits
+```
+
+最も高い確率でクラスを取得し、モデルの `id2label` マッピングを使用してそれをテキスト ラベルに変換します。
+
+```py
+>>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1)
+>>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()]
+>>> predicted_token_class
+['O',
+ 'O',
+ 'B-location',
+ 'I-location',
+ 'B-group',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'O',
+ 'B-location',
+ 'B-location',
+ 'O',
+ 'O']
+```
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/translation.md b/docs/source/ja/tasks/translation.md
new file mode 100644
index 00000000000000..9004a87fcbfff6
--- /dev/null
+++ b/docs/source/ja/tasks/translation.md
@@ -0,0 +1,417 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Translation
+
+[[open-in-colab]]
+
+<Youtube id="1JvfrvZgi6c"/>
+
+翻訳では、一連のテキストをある言語から別の言語に変換します。これは、シーケンス間問題として定式化できるいくつかのタスクの 1 つであり、翻訳や要約など、入力から何らかの出力を返すための強力なフレームワークです。翻訳システムは通常、異なる言語のテキスト間の翻訳に使用されますが、音声、またはテキストから音声への変換や音声からテキストへの変換など、音声間の組み合わせにも使用できます。
+
+このガイドでは、次の方法を説明します。
+
+1. [OPUS Books](https://huggingface.co/datasets/opus_books) データセットの英語-フランス語サブセットの [T5](https://huggingface.co/t5-small) を微調整して、英語のテキストを次の形式に翻訳します。フランス語。
+2. 微調整されたモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install transformers datasets evaluate sacrebleu
+```
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+## Load OPUS Books dataset
+
+まず、🤗 データセット ライブラリから [OPUS Books](https://huggingface.co/datasets/opus_books) データセットの英語とフランス語のサブセットを読み込みます。
+
+```py
+>>> from datasets import load_dataset
+
+>>> books = load_dataset("opus_books", "en-fr")
+```
+
+[`~datasets.Dataset.train_test_split`] メソッドを使用して、データセットをトレイン セットとテスト セットに分割します。
+
+
+```py
+>>> books = books["train"].train_test_split(test_size=0.2)
+```
+
+次に、例を見てみましょう。
+
+```py
+>>> books["train"][0]
+{'id': '90560',
+ 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.',
+  'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}}
+```
+
+`translation`: テキストの英語とフランス語の翻訳。
+
+## Preprocess
+
+<Youtube id="XAR8jnZZuUs"/>
+
+次のステップでは、T5 トークナイザーをロードして英語とフランス語の言語ペアを処理します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> checkpoint = "t5-small"
+>>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+```
+
+作成する前処理関数は次のことを行う必要があります。
+
+1. T5 がこれが翻訳タスクであることを認識できるように、入力の前にプロンプ​​トを付けます。複数の NLP タスクが可能な一部のモデルでは、特定のタスクのプロンプトが必要です。
+2. 英語の語彙で事前トレーニングされたトークナイザーを使用してフランス語のテキストをトークン化することはできないため、入力 (英語) とターゲット (フランス語) を別々にトークン化します。
+3. `max_length`パラメータで設定された最大長を超えないようにシーケンスを切り詰めます。
+```py
+>>> source_lang = "en"
+>>> target_lang = "fr"
+>>> prefix = "translate English to French: "
+
+
+>>> def preprocess_function(examples):
+...     inputs = [prefix + example[source_lang] for example in examples["translation"]]
+...     targets = [example[target_lang] for example in examples["translation"]]
+...     model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
+...     return model_inputs
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.Dataset.map`] メソッドを使用します。 `batched=True` を設定してデータセットの複数の要素を一度に処理することで、`map` 関数を高速化できます。
+
+```py
+>>> tokenized_books = books.map(preprocess_function, batched=True)
+```
+
+次に、[`DataCollat​​orForSeq2Seq`] を使用してサンプルのバッチを作成します。データセット全体を最大長までパディングするのではなく、照合中にバッチ内の最長の長さまで文を *動的にパディング* する方が効率的です。
+
+<frameworkcontent>
+<pt>
+
+```py
+>>> from transformers import DataCollatorForSeq2Seq
+
+>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
+```
+</pt>
+<tf>
+
+```py
+>>> from transformers import DataCollatorForSeq2Seq
+
+>>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
+```
+</tf>
+</frameworkcontent>
+
+## Evaluate
+
+トレーニング中にメトリクスを含めると、多くの場合、モデルのパフォーマンスを評価するのに役立ちます。 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) ライブラリを使用して、評価メソッドをすばやくロードできます。このタスクでは、[SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) メトリクスをロードします (🤗 Evaluate [クイック ツアー](https://huggingface.co/docs/evaluate/a_quick_tour) を参照してください) ) メトリクスの読み込みと計算方法の詳細については、次を参照してください)。
+
+```py
+>>> import evaluate
+
+>>> metric = evaluate.load("sacrebleu")
+```
+
+次に、予測とラベルを [`~evaluate.EvaluationModule.compute`] に渡して SacreBLEU スコアを計算する関数を作成します。
+```py
+>>> import numpy as np
+
+
+>>> def postprocess_text(preds, labels):
+...     preds = [pred.strip() for pred in preds]
+...     labels = [[label.strip()] for label in labels]
+
+...     return preds, labels
+
+
+>>> def compute_metrics(eval_preds):
+...     preds, labels = eval_preds
+...     if isinstance(preds, tuple):
+...         preds = preds[0]
+...     decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
+
+...     labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
+...     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
+
+...     decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
+
+...     result = metric.compute(predictions=decoded_preds, references=decoded_labels)
+...     result = {"bleu": result["score"]}
+
+...     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
+...     result["gen_len"] = np.mean(prediction_lens)
+...     result = {k: round(v, 4) for k, v in result.items()}
+...     return result
+```
+これで`compute_metrics`関数の準備が整いました。トレーニングをセットアップするときにこの関数に戻ります。
+
+## Train
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+[`Trainer`] を使用したモデルの微調整に慣れていない場合は、[ここ](../training#train-with-pytorch-trainer) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+
+これでモデルのトレーニングを開始する準備が整いました。 [`AutoModelForSeq2SeqLM`] を使用して T5 をロードします。
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+```
+
+この時点で残っているステップは 3 つだけです。
+
+1. [`Seq2SeqTrainingArguments`] でトレーニング ハイパーパラメータを定義します。唯一の必須パラメータは、モデルの保存場所を指定する `output_dir` です。 `push_to_hub=True`を設定して、このモデルをハブにプッシュします (モデルをアップロードするには、Hugging Face にサインインする必要があります)。各エポックの終了時に、[`Trainer`] は SacreBLEU メトリクスを評価し、トレーニング チェックポイントを保存します。
+2. トレーニング引数をモデル、データセット、トークナイザー、データ照合器、および `compute_metrics` 関数とともに [`Seq2SeqTrainer`] に渡します。
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+
+```py
+>>> training_args = Seq2SeqTrainingArguments(
+...     output_dir="my_awesome_opus_books_model",
+...     evaluation_strategy="epoch",
+...     learning_rate=2e-5,
+...     per_device_train_batch_size=16,
+...     per_device_eval_batch_size=16,
+...     weight_decay=0.01,
+...     save_total_limit=3,
+...     num_train_epochs=2,
+...     predict_with_generate=True,
+...     fp16=True,
+...     push_to_hub=True,
+... )
+
+>>> trainer = Seq2SeqTrainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=tokenized_books["train"],
+...     eval_dataset=tokenized_books["test"],
+...     tokenizer=tokenizer,
+...     data_collator=data_collator,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+<tf>
+<Tip>
+
+Keras を使用したモデルの微調整に慣れていない場合は、[こちら](../training#train-a-tensorflow-model-with-keras) の基本的なチュートリアルをご覧ください。
+
+</Tip>
+TensorFlow でモデルを微調整するには、オプティマイザー関数、学習率スケジュール、およびいくつかのトレーニング ハイパーパラメーターをセットアップすることから始めます。
+
+
+```py
+>>> from transformers import AdamWeightDecay
+
+>>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
+```
+
+次に、[`TFAutoModelForSeq2SeqLM`] を使用して T5 をロードできます。
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
+```
+
+[`~transformers.TFPreTrainedModel.prepare_tf_dataset`] を使用して、データセットを `tf.data.Dataset` 形式に変換します。
+
+```py
+>>> tf_train_set = model.prepare_tf_dataset(
+...     tokenized_books["train"],
+...     shuffle=True,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+
+>>> tf_test_set = model.prepare_tf_dataset(
+...     tokenized_books["test"],
+...     shuffle=False,
+...     batch_size=16,
+...     collate_fn=data_collator,
+... )
+```
+
+[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) を使用してトレーニング用のモデルを設定します。 Transformers モデルにはすべてデフォルトのタスク関連の損失関数があるため、次の場合を除き、損失関数を指定する必要はないことに注意してください。
+
+```py
+>>> import tensorflow as tf
+
+>>> model.compile(optimizer=optimizer)  # No loss argument!
+```
+トレーニングを開始する前にセットアップする最後の 2 つのことは、予測から SacreBLEU メトリクスを計算し、モデルをハブにプッシュする方法を提供することです。どちらも [Keras コールバック](../main_classes/keras_callbacks) を使用して行われます。
+
+`compute_metrics` 関数を [`~transformers.KerasMetricCallback`] に渡します。
+
+```py
+>>> from transformers.keras_callbacks import KerasMetricCallback
+
+>>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
+```
+
+[`~transformers.PushToHubCallback`] でモデルとトークナイザーをプッシュする場所を指定します。
+
+```py
+>>> from transformers.keras_callbacks import PushToHubCallback
+
+>>> push_to_hub_callback = PushToHubCallback(
+...     output_dir="my_awesome_opus_books_model",
+...     tokenizer=tokenizer,
+... )
+```
+
+次に、コールバックをまとめてバンドルします。
+
+```py
+>>> callbacks = [metric_callback, push_to_hub_callback]
+```
+
+ついに、モデルのトレーニングを開始する準備が整いました。トレーニングおよび検証データセット、エポック数、コールバックを指定して [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) を呼び出し、モデルを微調整します。
+
+```py
+>>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
+```
+
+トレーニングが完了すると、モデルは自動的にハブにアップロードされ、誰でも使用できるようになります。
+
+</tf>
+</frameworkcontent>
+
+<Tip>
+
+翻訳用にモデルを微調整する方法の詳細な例については、対応するドキュメントを参照してください。
+[PyTorch ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb)
+または [TensorFlow ノートブック](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb)。
+
+</Tip>
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+別の言語に翻訳したいテキストを考え出します。 T5 の場合、作業中のタスクに応じて入力に接頭辞を付ける必要があります。英語からフランス語に翻訳する場合は、以下に示すように入力に接頭辞を付ける必要があります。
+
+```py
+>>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
+```
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`] で使用することです。モデルを使用して翻訳用の`pipeline`をインスタンス化し、テキストをそれに渡します。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> translator = pipeline("translation", model="my_awesome_opus_books_model")
+>>> translator(text)
+[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}]
+```
+
+必要に応じて、`pipeline`の結果を手動で複製することもできます。
+
+<frameworkcontent>
+<pt>
+
+テキストをトークン化し、`input_ids` を PyTorch テンソルとして返します。
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
+>>> inputs = tokenizer(text, return_tensors="pt").input_ids
+```
+
+[`~transformers.generation_utils.GenerationMixin.generate`] メソッドを使用して翻訳を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。
+
+
+```py
+>>> from transformers import AutoModelForSeq2SeqLM
+
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
+>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
+```
+
+生成されたトークン ID をデコードしてテキストに戻します。
+
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+'Les lignées partagent des ressources avec des bactéries enfixant l'azote.'
+```
+
+</pt>
+<tf>
+
+`input_ids`を TensorFlow テンソルとして返します。 tensors:
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model")
+>>> inputs = tokenizer(text, return_tensors="tf").input_ids
+```
+
+[`~transformers.generation_tf_utils.TFGenerationMixin.generate`] メソッドを使用して翻訳を作成します。さまざまなテキスト生成戦略と生成を制御するためのパラメーターの詳細については、[Text Generation](../main_classes/text_generation) API を確認してください。
+
+```py
+>>> from transformers import TFAutoModelForSeq2SeqLM
+
+>>> model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model")
+>>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
+```
+
+生成されたトークン ID をデコードしてテキストに戻します。
+
+```py
+>>> tokenizer.decode(outputs[0], skip_special_tokens=True)
+'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.'
+```
+</tf>
+</frameworkcontent>
diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md
new file mode 100644
index 00000000000000..ae49875b714335
--- /dev/null
+++ b/docs/source/ja/tasks/video_classification.md
@@ -0,0 +1,503 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Video classification
+
+[[open-in-colab]]
+
+
+ビデオ分類は、ビデオ全体にラベルまたはクラスを割り当てるタスクです。ビデオには、各ビデオに 1 つのクラスのみが含まれることが期待されます。ビデオ分類モデルはビデオを入力として受け取り、ビデオがどのクラスに属するかについての予測を返します。これらのモデルを使用して、ビデオの内容を分類できます。ビデオ分類の実際のアプリケーションはアクション/アクティビティ認識であり、フィットネス アプリケーションに役立ちます。また、視覚障害のある人にとって、特に通勤時に役立ちます。
+
+このガイドでは、次の方法を説明します。
+
+1. [UCF101](https://www.crcv.ucf.edu/) のサブセットで [VideoMAE](https://huggingface.co/docs/transformers/main/en/model_doc/videomae) を微調整します。 data/UCF101.php) データセット。
+2. 微調整したモデルを推論に使用します。
+
+<Tip>
+このチュートリアルで説明するタスクは、次のモデル アーキテクチャでサポートされています。
+
+<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
+
+[TimeSformer](../model_doc/timesformer), [VideoMAE](../model_doc/videomae), [ViViT](../model_doc/vivit)
+
+<!--End of the generated tip-->
+
+</Tip>
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+```bash
+pip install -q pytorchvideo transformers evaluate
+```
+
+[PyTorchVideo](https://pytorchvideo.org/) (`pytorchvideo` と呼ばれます) を使用してビデオを処理し、準備します。
+
+モデルをアップロードしてコミュニティと共有できるように、Hugging Face アカウントにログインすることをお勧めします。プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Load UCF101 dataset
+
+まず、[UCF-101 データセット](https://www.crcv.ucf.edu/data/UCF101.php) のサブセットをロードします。これにより、完全なデータセットのトレーニングにさらに時間を費やす前に、実験してすべてが機能することを確認する機会が得られます。
+
+```py
+>>> from huggingface_hub import hf_hub_download
+
+>>> hf_dataset_identifier = "sayakpaul/ucf101-subset"
+>>> filename = "UCF101_subset.tar.gz"
+>>> file_path = hf_hub_download(repo_id=hf_dataset_identifier, filename=filename, repo_type="dataset")
+```
+
+サブセットをダウンロードした後、圧縮アーカイブを抽出する必要があります。
+
+```py 
+>>> import tarfile
+
+>>> with tarfile.open(file_path) as t:
+...      t.extractall(".")
+```
+
+大まかに言うと、データセットは次のように構成されています。
+
+```bash
+UCF101_subset/
+    train/
+        BandMarching/
+            video_1.mp4
+            video_2.mp4
+            ...
+        Archery
+            video_1.mp4
+            video_2.mp4
+            ...
+        ...
+    val/
+        BandMarching/
+            video_1.mp4
+            video_2.mp4
+            ...
+        Archery
+            video_1.mp4
+            video_2.mp4
+            ...
+        ...
+    test/
+        BandMarching/
+            video_1.mp4
+            video_2.mp4
+            ...
+        Archery
+            video_1.mp4
+            video_2.mp4
+            ...
+        ...
+```
+
+(`sorted`)された ビデオ パスは次のように表示されます。
+
+
+```bash
+...
+'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c04.avi',
+'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g07_c06.avi',
+'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g08_c01.avi',
+'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c02.avi',
+'UCF101_subset/train/ApplyEyeMakeup/v_ApplyEyeMakeup_g09_c06.avi'
+...
+```
+
+同じグループ/シーンに属するビデオ クリップがあり、ビデオ ファイル パスではグループが`g`で示されていることがわかります。たとえば、`v_ApplyEyeMakeup_g07_c04.avi`や`v_ApplyEyeMakeup_g07_c06.avi`などです。
+
+検証と評価の分割では、[データ漏洩](https://www.kaggle.com/code/alexisbcook/data-leakage) を防ぐために、同じグループ/シーンからのビデオ クリップを使用しないでください。このチュートリアルで使用しているサブセットでは、この情報が考慮されています。
+
+次に、データセット内に存在するラベルのセットを取得します。また、モデルを初期化するときに役立つ 2 つの辞書を作成します。
+
+* `label2id`: クラス名を整数にマップします。
+* `id2label`: 整数をクラス名にマッピングします。
+
+
+```py 
+>>> class_labels = sorted({str(path).split("/")[2] for path in all_video_file_paths})
+>>> label2id = {label: i for i, label in enumerate(class_labels)}
+>>> id2label = {i: label for label, i in label2id.items()}
+
+>>> print(f"Unique classes: {list(label2id.keys())}.")
+
+# Unique classes: ['ApplyEyeMakeup', 'ApplyLipstick', 'Archery', 'BabyCrawling', 'BalanceBeam', 'BandMarching', 'BaseballPitch', 'Basketball', 'BasketballDunk', 'BenchPress'].
+```
+
+個性的なクラスが10種類あります。トレーニング セットには、クラスごとに 30 個のビデオがあります。
+
+## Load a model to fine-tune
+
+事前トレーニングされたチェックポイントとそれに関連する画像プロセッサからビデオ分類モデルをインスタンス化します。モデルのエンコーダーには事前トレーニングされたパラメーターが付属しており、分類ヘッドはランダムに初期化されます。画像プロセッサは、データセットの前処理パイプラインを作成するときに役立ちます。
+
+```py 
+>>> from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
+
+>>> model_ckpt = "MCG-NJU/videomae-base"
+>>> image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
+>>> model = VideoMAEForVideoClassification.from_pretrained(
+...     model_ckpt,
+...     label2id=label2id,
+...     id2label=id2label,
+...     ignore_mismatched_sizes=True,  # provide this in case you're planning to fine-tune an already fine-tuned checkpoint
+... )
+```
+
+モデルのロード中に、次の警告が表示される場合があります。
+
+```bash
+Some weights of the model checkpoint at MCG-NJU/videomae-base were not used when initializing VideoMAEForVideoClassification: [..., 'decoder.decoder_layers.1.attention.output.dense.bias', 'decoder.decoder_layers.2.attention.attention.key.weight']
+- This IS expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
+- This IS NOT expected if you are initializing VideoMAEForVideoClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
+Some weights of VideoMAEForVideoClassification were not initialized from the model checkpoint at MCG-NJU/videomae-base and are newly initialized: ['classifier.bias', 'classifier.weight']
+You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
+```
+
+この警告は、一部の重み (たとえば、`classifier`層の重みとバイアス) を破棄し、他のいくつかの重み (新しい`classifier`層の重みとバイアス) をランダムに初期化していることを示しています。この場合、これは予想されることです。事前にトレーニングされた重みを持たない新しい頭部を追加しているため、推論に使用する前にこのモデルを微調整する必要があるとライブラリが警告します。これはまさに私たちが行おうとしているものです。する。
+
+**注意** [このチェックポイント](https://huggingface.co/MCG-NJU/videomae-base-finetuned-kinetics) は、同様のダウンストリームで微調整されてチェックポイントが取得されたため、このタスクのパフォーマンスが向上することに注意してください。かなりのドメインの重複があるタスク。 `MCG-NJU/videomae-base-finetuned-kinetics` を微調整して取得した [このチェックポイント](https://huggingface.co/sayakpaul/videomae-base-finetuned-kinetics-finetuned-ucf101-subset) を確認できます。 -キネティクス`。
+
+## Prepare the datasets for training
+
+ビデオの前処理には、[PyTorchVideo ライブラリ](https://pytorchvideo.org/) を利用します。まず、必要な依存関係をインポートします。
+
+
+```py 
+>>> import pytorchvideo.data
+
+>>> from pytorchvideo.transforms import (
+...     ApplyTransformToKey,
+...     Normalize,
+...     RandomShortSideScale,
+...     RemoveKey,
+...     ShortSideScale,
+...     UniformTemporalSubsample,
+... )
+
+>>> from torchvision.transforms import (
+...     Compose,
+...     Lambda,
+...     RandomCrop,
+...     RandomHorizontalFlip,
+...     Resize,
+... )
+```
+
+トレーニング データセットの変換には、均一な時間サブサンプリング、ピクセル正規化、ランダム クロッピング、およびランダムな水平反転を組み合わせて使用​​します。検証および評価のデータセット変換では、ランダムなトリミングと水平反転を除き、同じ変換チェーンを維持します。これらの変換の詳細については、[PyTorchVideo の公式ドキュメント](https://pytorchvideo.org) を参照してください。
+
+事前トレーニングされたモデルに関連付けられた`image_processor`を使用して、次の情報を取得します。
+
+* ビデオ フレームのピクセルが正規化される画像の平均値と標準偏差。
+* ビデオ フレームのサイズが変更される空間解像度。
+
+まず、いくつかの定数を定義します。
+
+```py
+>>> mean = image_processor.image_mean
+>>> std = image_processor.image_std
+>>> if "shortest_edge" in image_processor.size:
+...     height = width = image_processor.size["shortest_edge"]
+>>> else:
+...     height = image_processor.size["height"]
+...     width = image_processor.size["width"]
+>>> resize_to = (height, width)
+
+>>> num_frames_to_sample = model.config.num_frames
+>>> sample_rate = 4
+>>> fps = 30
+>>> clip_duration = num_frames_to_sample * sample_rate / fps
+```
+
+次に、データセット固有の変換とデータセットをそれぞれ定義します。トレーニングセットから始めます:
+
+
+```py 
+>>> train_transform = Compose(
+...     [
+...         ApplyTransformToKey(
+...             key="video",
+...             transform=Compose(
+...                 [
+...                     UniformTemporalSubsample(num_frames_to_sample),
+...                     Lambda(lambda x: x / 255.0),
+...                     Normalize(mean, std),
+...                     RandomShortSideScale(min_size=256, max_size=320),
+...                     RandomCrop(resize_to),
+...                     RandomHorizontalFlip(p=0.5),
+...                 ]
+...             ),
+...         ),
+...     ]
+... )
+
+>>> train_dataset = pytorchvideo.data.Ucf101(
+...     data_path=os.path.join(dataset_root_path, "train"),
+...     clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration),
+...     decode_audio=False,
+...     transform=train_transform,
+... )
+```
+
+同じ一連のワークフローを検証セットと評価セットに適用できます。
+
+
+```py 
+>>> val_transform = Compose(
+...     [
+...         ApplyTransformToKey(
+...             key="video",
+...             transform=Compose(
+...                 [
+...                     UniformTemporalSubsample(num_frames_to_sample),
+...                     Lambda(lambda x: x / 255.0),
+...                     Normalize(mean, std),
+...                     Resize(resize_to),
+...                 ]
+...             ),
+...         ),
+...     ]
+... )
+
+>>> val_dataset = pytorchvideo.data.Ucf101(
+...     data_path=os.path.join(dataset_root_path, "val"),
+...     clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
+...     decode_audio=False,
+...     transform=val_transform,
+... )
+
+>>> test_dataset = pytorchvideo.data.Ucf101(
+...     data_path=os.path.join(dataset_root_path, "test"),
+...     clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration),
+...     decode_audio=False,
+...     transform=val_transform,
+... )
+```
+
+**注意**: 上記のデータセット パイプラインは、[公式 PyTorchVideo サンプル](https://pytorchvideo.org/docs/tutorial_classification#dataset) から取得したものです。 [`pytorchvideo.data.Ucf101()`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.Ucf101) 関数を使用しています。 UCF-101 データセット。内部では、[`pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset`](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html#pytorchvideo.data.LabeledVideoDataset) オブジェクトを返します。 `LabeledVideoDataset` クラスは、PyTorchVideo データセット内のすべてのビデオの基本クラスです。したがって、PyTorchVideo で既製でサポートされていないカスタム データセットを使用したい場合は、それに応じて `LabeledVideoDataset` クラスを拡張できます。詳細については、`data`API [ドキュメント](https://pytorchvideo.readthedocs.io/en/latest/api/data/data.html)を参照してください。また、データセットが同様の構造 (上に示したもの) に従っている場合は、`pytorchvideo.data.Ucf101()` を使用すると問題なく動作するはずです。
+
+`num_videos` 引数にアクセスすると、データセット内のビデオの数を知ることができます。
+
+
+
+```py
+>>> print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos)
+# (300, 30, 75)
+```
+
+## Visualize the preprocessed video for better debugging 
+
+```py 
+>>> import imageio
+>>> import numpy as np
+>>> from IPython.display import Image
+
+>>> def unnormalize_img(img):
+...     """Un-normalizes the image pixels."""
+...     img = (img * std) + mean
+...     img = (img * 255).astype("uint8")
+...     return img.clip(0, 255)
+
+>>> def create_gif(video_tensor, filename="sample.gif"):
+...     """Prepares a GIF from a video tensor.
+...     
+...     The video tensor is expected to have the following shape:
+...     (num_frames, num_channels, height, width).
+...     """
+...     frames = []
+...     for video_frame in video_tensor:
+...         frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy())
+...         frames.append(frame_unnormalized)
+...     kargs = {"duration": 0.25}
+...     imageio.mimsave(filename, frames, "GIF", **kargs)
+...     return filename
+
+>>> def display_gif(video_tensor, gif_name="sample.gif"):
+...     """Prepares and displays a GIF from a video tensor."""
+...     video_tensor = video_tensor.permute(1, 0, 2, 3)
+...     gif_filename = create_gif(video_tensor, gif_name)
+...     return Image(filename=gif_filename)
+
+>>> sample_video = next(iter(train_dataset))
+>>> video_tensor = sample_video["video"]
+>>> display_gif(video_tensor)
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/sample_gif.gif" alt="Person playing basketball"/>
+</div>
+
+## Train the model 
+
+🤗 Transformers の [`Trainer`](https://huggingface.co/docs/transformers/main_classes/trainer) をモデルのトレーニングに利用します。 `Trainer`をインスタンス化するには、トレーニング構成と評価メトリクスを定義する必要があります。最も重要なのは [`TrainingArguments`](https://huggingface.co/transformers/main_classes/trainer.html#transformers.TrainingArguments) で、これはトレーニングを構成するためのすべての属性を含むクラスです。モデルのチェックポイントを保存するために使用される出力フォルダー名が必要です。また、🤗 Hub 上のモデル リポジトリ内のすべての情報を同期するのにも役立ちます。
+
+トレーニング引数のほとんどは一目瞭然ですが、ここで非常に重要なのは`remove_unused_columns=False`です。これにより、モデルの呼び出し関数で使用されない機能が削除されます。デフォルトでは`True`です。これは、通常、未使用の特徴列を削除し、モデルの呼び出し関数への入力を解凍しやすくすることが理想的であるためです。ただし、この場合、`pixel_values` (モデルが入力で期待する必須キーです) を作成するには、未使用の機能 (特に`video`) が必要です。
+
+```py 
+>>> from transformers import TrainingArguments, Trainer
+
+>>> model_name = model_ckpt.split("/")[-1]
+>>> new_model_name = f"{model_name}-finetuned-ucf101-subset"
+>>> num_epochs = 4
+
+>>> args = TrainingArguments(
+...     new_model_name,
+...     remove_unused_columns=False,
+...     evaluation_strategy="epoch",
+...     save_strategy="epoch",
+...     learning_rate=5e-5,
+...     per_device_train_batch_size=batch_size,
+...     per_device_eval_batch_size=batch_size,
+...     warmup_ratio=0.1,
+...     logging_steps=10,
+...     load_best_model_at_end=True,
+...     metric_for_best_model="accuracy",
+...     push_to_hub=True,
+...     max_steps=(train_dataset.num_videos // batch_size) * num_epochs,
+... )
+```
+
+`pytorchvideo.data.Ucf101()` によって返されるデータセットは `__len__` メソッドを実装していません。そのため、`TrainingArguments`をインスタンス化するときに`max_steps`を定義する必要があります。
+
+次に、予測からメトリクスを計算する関数を定義する必要があります。これは、これからロードする`metric`を使用します。必要な前処理は、予測されたロジットの argmax を取得することだけです。
+
+```py
+import evaluate
+
+metric = evaluate.load("accuracy")
+
+
+def compute_metrics(eval_pred):
+    predictions = np.argmax(eval_pred.predictions, axis=1)
+    return metric.compute(predictions=predictions, references=eval_pred.label_ids)
+```
+
+**評価に関する注意事項**:
+
+[VideoMAE 論文](https://arxiv.org/abs/2203.12602) では、著者は次の評価戦略を使用しています。彼らはテスト ビデオからのいくつかのクリップでモデルを評価し、それらのクリップにさまざまなクロップを適用して、合計スコアを報告します。ただし、単純さと簡潔さを保つために、このチュートリアルではそれを考慮しません。
+
+また、サンプルをまとめてバッチ処理するために使用される `collat​​e_fn` を定義します。各バッチは、`pixel_values` と `labels` という 2 つのキーで構成されます。
+
+
+```py 
+>>> def collate_fn(examples):
+...     # permute to (num_frames, num_channels, height, width)
+...     pixel_values = torch.stack(
+...         [example["video"].permute(1, 0, 2, 3) for example in examples]
+...     )
+...     labels = torch.tensor([example["label"] for example in examples])
+...     return {"pixel_values": pixel_values, "labels": labels}
+```
+
+次に、これらすべてをデータセットとともに`Trainer`に渡すだけです。
+
+```py 
+>>> trainer = Trainer(
+...     model,
+...     args,
+...     train_dataset=train_dataset,
+...     eval_dataset=val_dataset,
+...     tokenizer=image_processor,
+...     compute_metrics=compute_metrics,
+...     data_collator=collate_fn,
+... )
+```
+
+すでにデータを前処理しているのに、なぜトークナイザーとして`image_processor`を渡したのか不思議に思うかもしれません。これは、イメージ プロセッサ構成ファイル (JSON として保存) もハブ上のリポジトリにアップロードされるようにするためだけです。
+
+次に、`train` メソッドを呼び出してモデルを微調整します。
+
+```py 
+>>> train_results = trainer.train()
+```
+
+トレーニングが完了したら、 [`~transformers.Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、誰もがモデルを使用できるようにします。
+
+```py
+>>> trainer.push_to_hub()
+```
+
+## Inference
+
+モデルを微調整したので、それを推論に使用できるようになりました。
+
+推論のためにビデオをロードします。
+
+```py 
+>>> sample_test_video = next(iter(test_dataset))
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/sample_gif_two.gif" alt="Teams playing basketball"/>
+</div>
+
+推論用に微調整されたモデルを試す最も簡単な方法は、それを [`pipeline`](https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.VideoClassificationPipeline). で使用することです。モデルを使用してビデオ分類用の` pipeline`をインスタンス化し、それにビデオを渡します。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> video_cls = pipeline(model="my_awesome_video_cls_model")
+>>> video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi")
+[{'score': 0.9272987842559814, 'label': 'BasketballDunk'},
+ {'score': 0.017777055501937866, 'label': 'BabyCrawling'},
+ {'score': 0.01663011871278286, 'label': 'BalanceBeam'},
+ {'score': 0.009560945443809032, 'label': 'BandMarching'},
+ {'score': 0.0068979403004050255, 'label': 'BaseballPitch'}]
+```
+
+必要に応じて、`pipeline`の結果を手動で複製することもできます。
+
+```py
+>>> def run_inference(model, video):
+...     # (num_frames, num_channels, height, width)
+...     perumuted_sample_test_video = video.permute(1, 0, 2, 3)
+...     inputs = {
+...         "pixel_values": perumuted_sample_test_video.unsqueeze(0),
+...         "labels": torch.tensor(
+...             [sample_test_video["label"]]
+...         ),  # this can be skipped if you don't have labels available.
+...     }
+
+...     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+...     inputs = {k: v.to(device) for k, v in inputs.items()}
+...     model = model.to(device)
+
+...     # forward pass
+...     with torch.no_grad():
+...         outputs = model(**inputs)
+...         logits = outputs.logits
+
+...     return logits
+```
+
+次に、入力をモデルに渡し、`logits `を返します。
+
+```
+>>> logits = run_inference(trained_model, sample_test_video["video"])
+```
+
+`logits` をデコードすると、次のようになります。
+
+```py 
+>>> predicted_class_idx = logits.argmax(-1).item()
+>>> print("Predicted class:", model.config.id2label[predicted_class_idx])
+# Predicted class: BasketballDunk
+```
diff --git a/docs/source/ja/tasks/visual_question_answering.md b/docs/source/ja/tasks/visual_question_answering.md
new file mode 100644
index 00000000000000..f6c2989693708b
--- /dev/null
+++ b/docs/source/ja/tasks/visual_question_answering.md
@@ -0,0 +1,405 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Visual Question Answering
+
+[[open-in-colab]]
+
+Visual Question Answering (VQA) は、画像に基づいて自由形式の質問に答えるタスクです。
+このタスクをサポートするモデルへの入力は通常、画像と質問の組み合わせであり、出力は
+自然言語で表現された答え。
+
+VQA の注目すべき使用例には次のようなものがあります。
+* 視覚障害者向けのアクセシビリティ アプリケーション。
+* 教育: 講義や教科書で示されている視覚的な資料について質問を投げかけること。 VQA は、インタラクティブな博物館の展示物や史跡でも利用できます。
+* カスタマー サービスと電子商取引: VQA は、ユーザーが製品について質問できるようにすることでユーザー エクスペリエンスを向上させます。
+* 画像検索: VQA モデルを使用して、特定の特徴を持つ画像を検索できます。たとえば、ユーザーは「犬はいますか?」と尋ねることができます。一連の画像から犬が写っているすべての画像を検索します。
+
+このガイドでは、次の方法を学びます。
+
+- [`Graphcore/vqa` データセット](https://huggingface.co/datasets/Graphcore/vqa) 上で分類 VQA モデル、特に [ViLT](../model_doc/vilt) を微調整します。
+- 微調整された ViLT を推論に使用します。
+- BLIP-2 などの生成モデルを使用してゼロショット VQA 推論を実行します。
+
+## Fine-tuning ViLT
+
+ViLT モデルは、Vision Transformer (ViT) にテキスト埋め込みを組み込んでおり、最小限の設計を可能にします。
+視覚と言語の事前トレーニング (VLP)。このモデルは、いくつかの下流タスクに使用できます。 VQA タスクの場合、分類子
+head は最上部 (`[CLS]` トークンの最終的な非表示状態の最上部にある線形層) に配置され、ランダムに初期化されます。
+したがって、視覚的質問応答は **分類問題** として扱われます。
+
+BLIP、BLIP-2、InstructBLIP などの最近のモデルは、VQA を生成タスクとして扱います。このガイドの後半では、
+ゼロショット VQA 推論にそれらを使用する方法を示します。
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q transformers datasets
+```
+モデルをコミュニティと共有することをお勧めします。 Hugging Face アカウントにログインして、🤗 ハブにアップロードします。
+プロンプトが表示されたら、トークンを入力してログインします。
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+モデルのチェックポイントをグローバル変数として定義しましょう。
+
+```py
+>>> model_checkpoint = "dandelin/vilt-b32-mlm"
+```
+
+## Load the data
+
+説明の目的で、このガイドでは、注釈付きの視覚的な質問に答える「Graphcore/vqa」データセットの非常に小さなサンプルを使用します。
+完全なデータセットは [🤗 Hub](https://huggingface.co/datasets/Graphcore/vqa) で見つけることができます。
+
+[`Graphcore/vqa` データセット](https://huggingface.co/datasets/Graphcore/vqa) の代わりに、
+公式 [VQA データセット ページ](https://visualqa.org/download.html) から同じデータを手動で取得します。フォローしたい場合は、
+カスタム データを使用したチュートリアルでは、[画像データセットを作成する](https://huggingface.co/docs/datasets/image_dataset#loading-script) 方法を確認してください。
+🤗 データセットのドキュメントのガイド。
+
+検証分割から最初の 200 個の例をロードし、データセットの機能を調べてみましょう。
+
+```python
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("Graphcore/vqa", split="validation[:200]")
+>>> dataset
+Dataset({
+    features: ['question', 'question_type', 'question_id', 'image_id', 'answer_type', 'label'],
+    num_rows: 200
+})
+```
+データセットの特徴を理解するために例を見てみましょう。
+
+```py
+>>> dataset[0]
+{'question': 'Where is he looking?',
+ 'question_type': 'none of the above',
+ 'question_id': 262148000,
+ 'image_id': '/root/.cache/huggingface/datasets/downloads/extracted/ca733e0e000fb2d7a09fbcc94dbfe7b5a30750681d0e965f8e0a23b1c2f98c75/val2014/COCO_val2014_000000262148.jpg',
+ 'answer_type': 'other',
+ 'label': {'ids': ['at table', 'down', 'skateboard', 'table'],
+  'weights': [0.30000001192092896,
+   1.0,
+   0.30000001192092896,
+   0.30000001192092896]}}
+```
+
+このタスクに関連する機能には次のものがあります。
+* `question`: 画像から回答する質問
+* `image_id`: 質問が参照する画像へのパス
+* `label`: 注釈
+
+残りの機能は必要ないので削除できます。
+
+
+```py 
+>>> dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type'])
+```
+
+ご覧のとおり、`label`機能には、さまざまなヒューマン・アノテーターによって収集された、同じ質問に対する複数の回答 (ここでは`id`と呼びます) が含まれています。
+質問に対する答えは主観的なものになる可能性があるためです。この場合、問題は "彼はどこを見ているのか？"ということです。一部の人々
+これには "ダウン" という注釈が付けられ、他のものには "テーブルで" という注釈が付けられ、別の注釈には "スケートボード" という注釈が付けられました。
+
+画像を見て、どの答えを出すかを考えてください。
+
+```python
+>>> from PIL import Image
+
+>>> image = Image.open(dataset[0]['image_id'])
+>>> image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/vqa-example.png" alt="VQA Image Example"/>
+</div>
+
+
+質問と回答のあいまいさのため、このようなデータセットはマルチラベル分類問題として扱われます (
+複数の回答が有効である可能性があります)。さらに、ワンホット エンコードされたベクトルを作成するだけではなく、
+注釈内に特定の回答が出現した回数に基づくソフト エンコーディング。
+
+たとえば、上の例では、"down"という回答が他の回答よりも頻繁に選択されるため、
+スコア (データセットでは`weight`と呼ばれます) は 1.0 で、残りの回答のスコアは 1.0 未満です。
+
+後で適切な分類ヘッドを使用してモデルをインスタンス化するために、2 つの辞書を作成しましょう。
+ラベル名を整数に変換する、またはその逆:
+
+```py
+>>> import itertools
+
+>>> labels = [item['ids'] for item in dataset['label']]
+>>> flattened_labels = list(itertools.chain(*labels))
+>>> unique_labels = list(set(flattened_labels))
+
+>>> label2id = {label: idx for idx, label in enumerate(unique_labels)}
+>>> id2label = {idx: label for label, idx in label2id.items()} 
+```
+
+マッピングができたので、文字列の回答をその ID に置き換え、さらに前処理をより便利にするためにデータセットをフラット化することができます。
+
+```python
+>>> def replace_ids(inputs):
+...   inputs["label"]["ids"] = [label2id[x] for x in inputs["label"]["ids"]]
+...   return inputs
+
+
+>>> dataset = dataset.map(replace_ids)
+>>> flat_dataset = dataset.flatten()
+>>> flat_dataset.features
+{'question': Value(dtype='string', id=None),
+ 'image_id': Value(dtype='string', id=None),
+ 'label.ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
+ 'label.weights': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)}
+```
+
+## Preprocessing data
+
+次のステップでは、ViLT プロセッサをロードして、モデルの画像データとテキスト データを準備します。
+[`ViltProcessor`] は、BERT トークナイザーと ViLT 画像プロセッサを便利な単一プロセッサにラップします。
+
+```py 
+>>> from transformers import ViltProcessor
+
+>>> processor = ViltProcessor.from_pretrained(model_checkpoint)
+```
+
+データを前処理するには、[`ViltProcessor`] を使用して画像と質問をエンコードする必要があります。プロセッサーは使用します
+[`BertTokenizerFast`] を使用してテキストをトークン化し、テキスト データの `input_ids`、`attention_mask`、および `token_type_ids` を作成します。
+画像に関しては、プロセッサは [`ViltImageProcessor`] を利用して画像のサイズ変更と正規化を行い、`pixel_values` と `pixel_mask` を作成します。
+
+これらの前処理ステップはすべて内部で行われ、`processor`を呼び出すだけで済みます。ただし、それでも必要なのは、
+対象のラベルを準備します。この表現では、各要素は考えられる答え (ラベル) に対応します。正解の場合、要素は保持されます。
+それぞれのスコア (重み) が設定され、残りの要素は 0 に設定されます。
+
+次の関数は、画像と質問に `processor` を適用し、上で説明したようにラベルをフォーマットします。
+
+```py
+>>> import torch
+
+>>> def preprocess_data(examples):
+...     image_paths = examples['image_id']
+...     images = [Image.open(image_path) for image_path in image_paths]
+...     texts = examples['question']    
+
+...     encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt")
+
+...     for k, v in encoding.items():
+...           encoding[k] = v.squeeze()
+    
+...     targets = []
+
+...     for labels, scores in zip(examples['label.ids'], examples['label.weights']):
+...         target = torch.zeros(len(id2label))
+
+...         for label, score in zip(labels, scores):
+...             target[label] = score
+      
+...         targets.append(target)
+
+...     encoding["labels"] = targets
+    
+...     return encoding
+```
+
+データセット全体に前処理関数を適用するには、🤗 Datasets [`~datasets.map`] 関数を使用します。 `map` を高速化するには、次のようにします。
+データセットの複数の要素を一度に処理するには、`batched=True` を設定します。この時点で、不要な列は自由に削除してください。
+
+
+```py
+>>> processed_dataset = flat_dataset.map(preprocess_data, batched=True, remove_columns=['question','question_type',  'question_id', 'image_id', 'answer_type', 'label.ids', 'label.weights'])
+>>> processed_dataset
+Dataset({
+    features: ['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'],
+    num_rows: 200
+})
+```
+
+最後のステップとして、[`DefaultDataCollat​​or`] を使用してサンプルのバッチを作成します。
+
+
+```py
+>>> from transformers import DefaultDataCollator
+
+>>> data_collator = DefaultDataCollator()
+```
+
+## Train the model
+
+これでモデルのトレーニングを開始する準備が整いました。 [`ViltForQuestionAnswering`] で ViLT をロードします。ラベルの数を指定します
+ラベルマッピングとともに:
+
+```py
+>>> from transformers import ViltForQuestionAnswering
+
+>>> model = ViltForQuestionAnswering.from_pretrained(model_checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id)
+```
+
+この時点で残っているステップは 3 つだけです。
+
+1. [`TrainingArguments`] でトレーニング ハイパーパラメータを定義します。
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> repo_id = "MariaK/vilt_finetuned_200"
+
+>>> training_args = TrainingArguments(
+...     output_dir=repo_id,
+...     per_device_train_batch_size=4,
+...     num_train_epochs=20,
+...     save_steps=200,
+...     logging_steps=50,
+...     learning_rate=5e-5,
+...     save_total_limit=2,
+...     remove_unused_columns=False,
+...     push_to_hub=True,
+... )
+```
+
+2. トレーニング引数をモデル、データセット、プロセッサー、データ照合器とともに [`Trainer`] に渡します。
+
+```py
+>>> from transformers import Trainer
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     data_collator=data_collator,
+...     train_dataset=processed_dataset,
+...     tokenizer=processor,
+... )
+```
+
+3. [`~Trainer.train`] を呼び出してモデルを微調整します。
+
+```py
+>>> trainer.train() 
+```
+
+トレーニングが完了したら、 [`~Trainer.push_to_hub`] メソッドを使用してモデルをハブに共有し、🤗 ハブで最終モデルを共有します。
+
+```py
+>>> trainer.push_to_hub()
+```
+
+## Inference
+
+ViLT モデルを微調整し、🤗 Hub にアップロードしたので、それを推論に使用できます。もっとも単純な
+推論用に微調整されたモデルを試す方法は、それを [`pipeline`] で使用することです。
+
+```py
+>>> from transformers import pipeline
+
+>>> pipe = pipeline("visual-question-answering", model="MariaK/vilt_finetuned_200")
+```
+
+このガイドのモデルは 200 の例でのみトレーニングされているため、多くを期待しないでください。少なくともそれがあるかどうか見てみましょう
+データから何かを学習し、推論を説明するためにデータセットから最初の例を取り出します。
+
+```py
+>>> example = dataset[0]
+>>> image = Image.open(example['image_id'])
+>>> question = example['question']
+>>> print(question)
+>>> pipe(image, question, top_k=1)
+"Where is he looking?"
+[{'score': 0.5498199462890625, 'answer': 'down'}]
+```
+
+あまり自信がありませんが、モデルは確かに何かを学習しました。より多くの例とより長いトレーニングを行うと、はるかに良い結果が得られます。
+
+必要に応じて、パイプラインの結果を手動で複製することもできます。
+1. 画像と質問を取得し、モデルのプロセッサを使用してモデル用に準備します。
+2. モデルを通じて結果または前処理を転送します。
+3. ロジットから、最も可能性の高い回答の ID を取得し、`id2label` で実際の回答を見つけます。
+
+```py
+>>> processor = ViltProcessor.from_pretrained("MariaK/vilt_finetuned_200")
+
+>>> image = Image.open(example['image_id'])
+>>> question = example['question']
+
+>>> # prepare inputs
+>>> inputs = processor(image, question, return_tensors="pt")
+
+>>> model = ViltForQuestionAnswering.from_pretrained("MariaK/vilt_finetuned_200")
+
+>>> # forward pass
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+
+>>> logits = outputs.logits
+>>> idx = logits.argmax(-1).item()
+>>> print("Predicted answer:", model.config.id2label[idx])
+Predicted answer: down
+```
+
+## Zero-shot VQA
+
+以前のモデルでは、VQA を分類タスクとして扱いました。 BLIP、BLIP-2、InstructBLIP アプローチなどの一部の最近のモデル
+生成タスクとしての VQA。 [BLIP-2](../model_doc/blip-2) を例として考えてみましょう。新しいビジュアル言語の事前トレーニングを導入しました
+事前にトレーニングされたビジョン エンコーダーと LLM を任意に組み合わせて使用​​できるパラダイム (詳細については、[BLIP-2 ブログ投稿](https://huggingface.co/blog/blip-2) を参照)。
+これにより、視覚的な質問応答を含む複数の視覚言語タスクで最先端の結果を達成することができます。
+
+このモデルを VQA に使用する方法を説明しましょう。まず、モデルをロードしましょう。ここではモデルを明示的に送信します。
+GPU (利用可能な場合)。これは [`Trainer`] が自動的に処理するため、トレーニング時に事前に行う必要はありませんでした。
+
+
+```py
+>>> from transformers import AutoProcessor, Blip2ForConditionalGeneration
+>>> import torch
+
+>>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b")
+>>> model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16)
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> model.to(device)
+```
+
+モデルは画像とテキストを入力として受け取るため、VQA データセットの最初の例とまったく同じ画像と質問のペアを使用してみましょう。
+
+
+```py 
+>>> example = dataset[0]
+>>> image = Image.open(example['image_id'])
+>>> question = example['question']
+```
+
+視覚的な質問応答タスクに BLIP-2 を使用するには、テキスト プロンプトが特定の形式 (`Question: {} Answer:`) に従う必要があります。
+
+
+```py
+>>> prompt = f"Question: {question} Answer:" 
+```
+
+次に、モデルのプロセッサで画像/プロンプトを前処理し、処理された入力をモデルに渡し、出力をデコードする必要があります。
+
+```py
+>>> inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16)
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+>>> print(generated_text)
+"He is looking at the crowd" 
+```
+
+ご覧のとおり、モデルは群衆と顔の向き (下を向いている) を認識しましたが、見逃しているようです。
+観客がスケーターの後ろにいるという事実。それでも、人間が注釈を付けたデータセットを取得することが不可能な場合には、これは
+このアプローチにより、有用な結果がすぐに得られます。
diff --git a/docs/source/ja/tasks/zero_shot_image_classification.md b/docs/source/ja/tasks/zero_shot_image_classification.md
new file mode 100644
index 00000000000000..d8d0b530334ec4
--- /dev/null
+++ b/docs/source/ja/tasks/zero_shot_image_classification.md
@@ -0,0 +1,148 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Zero-shot image classification
+
+[[open-in-colab]]
+
+ゼロショット画像分類は、次のモデルを使用して画像をさまざまなカテゴリに分類するタスクです。
+これらの特定のカテゴリのラベル付きの例を含むデータに対して明示的にトレーニングされていない。
+
+従来、画像分類には、ラベル付き画像の特定のセットでモデルをトレーニングする必要があり、このモデルは次のことを学習します。
+特定の画像の特徴をラベルに「マッピング」します。分類タスクにそのようなモデルを使用する必要がある場合、
+新しいラベルのセットでは、モデルを "再調整" するために微調整が必​​要です。
+
+対照的に、ゼロショットまたはオープン語彙画像分類モデルは、通常、大規模なシステムでトレーニングされたマルチモーダル モデルです。
+画像と関連する説明のデータセット。これらのモデルは、ゼロショット画像分類を含む多くの下流タスクに使用できる、調整された視覚言語表現を学習します。
+
+これは、画像分類に対するより柔軟なアプローチであり、モデルを新しいまだ見たことのないカテゴリに一般化できるようになります。
+追加のトレーニング データを必要とせず、ユーザーはターゲット オブジェクトの自由形式のテキスト説明を含む画像をクエリできるようになります。
+
+このガイドでは、次の方法を学びます。
+
+* ゼロショット画像分類パイプラインを作成する
+* 手動でゼロショット画像分類推論を実行します
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q transformers
+```
+
+## Zero-shot image classification pipeline
+
+ゼロショット画像分類をサポートするモデルで推論を試す最も簡単な方法は、対応する [`パイプライン`] を使用することです。
+[Hugging Face Hub のチェックポイント](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads) からパイプラインをインスタンス化します。
+
+```python
+>>> from transformers import pipeline
+
+>>> checkpoint = "openai/clip-vit-large-patch14"
+>>> detector = pipeline(model=checkpoint, task="zero-shot-image-classification")
+```
+
+次に、分類したい画像を選択します。
+
+```py
+>>> from PIL import Image
+>>> import requests
+
+>>> url = "https://unsplash.com/photos/g8oS8-82DxI/download?ixid=MnwxMjA3fDB8MXx0b3BpY3x8SnBnNktpZGwtSGt8fHx8fDJ8fDE2NzgxMDYwODc&force=true&w=640"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/owl.jpg" alt="Photo of an owl"/>
+</div>
+
+画像と候補オブジェクトのラベルをパイプラインに渡します。ここでは画像を直接渡します。他の適切なオプション
+画像へのローカル パスまたは画像 URL を含めます。
+候補ラベルは、この例のように単純な単語にすることも、より説明的な単語にすることもできます。
+
+```py
+>>> predictions = detector(image, candidate_labels=["fox", "bear", "seagull", "owl"])
+>>> predictions
+[{'score': 0.9996670484542847, 'label': 'owl'},
+ {'score': 0.000199399160919711, 'label': 'seagull'},
+ {'score': 7.392891711788252e-05, 'label': 'fox'},
+ {'score': 5.96074532950297e-05, 'label': 'bear'}]
+```
+
+## Zero-shot image classification by hand
+
+ゼロショット画像分類パイプラインの使用方法を理解したところで、ゼロショットを実行する方法を見てみましょう。
+画像を手動で分類します。
+
+まず、[Hugging Face Hub のチェックポイント](https://huggingface.co/models?pipeline_tag=zero-shot-image-classification&sort=downloads) からモデルと関連プロセッサをロードします。
+ここでは、前と同じチェックポイントを使用します。
+
+```py
+>>> from transformers import AutoProcessor, AutoModelForZeroShotImageClassification
+
+>>> model = AutoModelForZeroShotImageClassification.from_pretrained(checkpoint)
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+```
+
+気分を変えて、別の画像を撮ってみましょう。
+
+```py
+>>> from PIL import Image
+>>> import requests
+
+>>> url = "https://unsplash.com/photos/xBRQfR2bqNI/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjc4Mzg4ODEx&force=true&w=640"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg" alt="Photo of a car"/>
+</div>
+
+プロセッサを使用してモデルの入力を準備します。プロセッサーは、
+サイズ変更と正規化によるモデルの画像、およびテキスト入力を処理するトークナイザー。
+
+```py
+>>> candidate_labels = ["tree", "car", "bike", "cat"]
+>>> inputs = processor(images=image, text=candidate_labels, return_tensors="pt", padding=True)
+```
+
+入力をモデルに渡し、結果を後処理します。
+
+
+```py
+>>> import torch
+
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+
+>>> logits = outputs.logits_per_image[0]
+>>> probs = logits.softmax(dim=-1).numpy()
+>>> scores = probs.tolist()
+
+>>> result = [
+...     {"score": score, "label": candidate_label}
+...     for score, candidate_label in sorted(zip(probs, candidate_labels), key=lambda x: -x[0])
+... ]
+
+>>> result
+[{'score': 0.998572, 'label': 'car'},
+ {'score': 0.0010570387, 'label': 'bike'},
+ {'score': 0.0003393686, 'label': 'tree'},
+ {'score': 3.1572064e-05, 'label': 'cat'}]
+```
diff --git a/docs/source/ja/tasks/zero_shot_object_detection.md b/docs/source/ja/tasks/zero_shot_object_detection.md
new file mode 100644
index 00000000000000..7dbc97ef3a1842
--- /dev/null
+++ b/docs/source/ja/tasks/zero_shot_object_detection.md
@@ -0,0 +1,310 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Zero-shot object detection
+
+[[open-in-colab]]
+
+従来、[オブジェクト検出](object_detection) に使用されるモデルには、トレーニング用のラベル付き画像データセットが必要でした。
+トレーニング データからのクラスのセットの検出に限定されます。
+
+ゼロショットオブジェクト検出は、別のアプローチを使用する [OWL-ViT](../model_doc/owlvit) モデルによってサポートされています。 OWL-ViT
+オープン語彙オブジェクト検出器です。これは、フリーテキストクエリに基づいて画像内のオブジェクトを検出できることを意味します。
+ラベル付きデータセットでモデルを微調整する必要性。
+
+OWL-ViTは、マルチモーダル表現を利用してオープン語彙の検出を実行します。 [CLIP](../model_doc/clip) とを組み合わせます。
+軽量のオブジェクト分類および位置特定ヘッド。オープン語彙の検出は、CLIP のテキスト エンコーダーを使用してフリーテキスト クエリを埋め込み、それらをオブジェクト分類およびローカリゼーション ヘッドへの入力として使用することによって実現されます。
+画像とそれに対応するテキストの説明を関連付け、ViT は画像パッチを入力として処理します。作家たち
+のOWL-ViTは、まずCLIPをゼロからトレーニングし、次に標準の物体検出データセットを使用してOWL-ViTをエンドツーエンドで微調整しました。
+二部マッチング損失。
+
+このアプローチを使用すると、モデルはラベル付きデータセットで事前にトレーニングしなくても、テキストの説明に基づいてオブジェクトを検出できます。
+
+このガイドでは、OWL-ViT の使用方法を学習します。
+- テキストプロンプトに基づいてオブジェクトを検出します
+- バッチオブジェクト検出用
+- 画像誘導物体検出用
+
+始める前に、必要なライブラリがすべてインストールされていることを確認してください。
+
+```bash
+pip install -q transformers
+```
+
+## Zero-shot object detection pipeline
+
+OWL-ViTによる推論を試す最も簡単な方法は、OWL-ViTを[`pipeline`]で使用することです。パイプラインをインスタンス化する
+[Hugging Face Hub のチェックポイント](https://huggingface.co/models?other=owlvit) からのゼロショット オブジェクト検出の場合:
+
+```python
+>>> from transformers import pipeline
+
+>>> checkpoint = "google/owlvit-base-patch32"
+>>> detector = pipeline(model=checkpoint, task="zero-shot-object-detection")
+```
+
+次に、物体を検出したい画像を選択します。ここでは、宇宙飛行士アイリーン・コリンズの画像を使用します。
+[NASA](https://www.nasa.gov/multimedia/imagegallery/index.html) Great Images データセットの一部。
+
+```py
+>>> import skimage
+>>> import numpy as np
+>>> from PIL import Image
+
+>>> image = skimage.data.astronaut()
+>>> image = Image.fromarray(np.uint8(image)).convert("RGB")
+
+>>> image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_1.png" alt="Astronaut Eileen Collins"/>
+</div>
+
+検索する画像と候補オブジェクトのラベルをパイプラインに渡します。
+ここでは画像を直接渡します。他の適切なオプションには、画像へのローカル パスまたは画像 URL が含まれます。また、画像をクエリするすべてのアイテムのテキスト説明も渡します。
+
+```py
+>>> predictions = detector(
+...     image,
+...     candidate_labels=["human face", "rocket", "nasa badge", "star-spangled banner"],
+... )
+>>> predictions
+[{'score': 0.3571370542049408,
+  'label': 'human face',
+  'box': {'xmin': 180, 'ymin': 71, 'xmax': 271, 'ymax': 178}},
+ {'score': 0.28099656105041504,
+  'label': 'nasa badge',
+  'box': {'xmin': 129, 'ymin': 348, 'xmax': 206, 'ymax': 427}},
+ {'score': 0.2110239565372467,
+  'label': 'rocket',
+  'box': {'xmin': 350, 'ymin': -1, 'xmax': 468, 'ymax': 288}},
+ {'score': 0.13790413737297058,
+  'label': 'star-spangled banner',
+  'box': {'xmin': 1, 'ymin': 1, 'xmax': 105, 'ymax': 509}},
+ {'score': 0.11950037628412247,
+  'label': 'nasa badge',
+  'box': {'xmin': 277, 'ymin': 338, 'xmax': 327, 'ymax': 380}},
+ {'score': 0.10649408400058746,
+  'label': 'rocket',
+  'box': {'xmin': 358, 'ymin': 64, 'xmax': 424, 'ymax': 280}}]
+```
+
+予測を視覚化してみましょう。
+
+
+```py
+>>> from PIL import ImageDraw
+
+>>> draw = ImageDraw.Draw(image)
+
+>>> for prediction in predictions:
+...     box = prediction["box"]
+...     label = prediction["label"]
+...     score = prediction["score"]
+
+...     xmin, ymin, xmax, ymax = box.values()
+...     draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1)
+...     draw.text((xmin, ymin), f"{label}: {round(score,2)}", fill="white")
+
+>>> image
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_2.png" alt="Visualized predictions on NASA image"/>
+</div>
+
+## Text-prompted zero-shot object detection by hand
+
+ゼロショット物体検出パイプラインの使用方法を確認したので、同じことを再現してみましょう。
+手動で結果を取得します。
+
+まず、[Hugging Face Hub のチェックポイント](https://huggingface.co/models?other=owlvit) からモデルと関連プロセッサをロードします。
+ここでは、前と同じチェックポイントを使用します。
+
+```py
+>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint)
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+```
+
+気分を変えて、別の画像を撮ってみましょう。
+
+```py
+>>> import requests
+
+>>> url = "https://unsplash.com/photos/oj0zeY2Ltk4/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MTR8fHBpY25pY3xlbnwwfHx8fDE2Nzc0OTE1NDk&force=true&w=640"
+>>> im = Image.open(requests.get(url, stream=True).raw)
+>>> im
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_3.png" alt="Beach photo"/>
+</div>
+
+プロセッサを使用してモデルの入力を準備します。プロセッサーは、
+サイズ変更と正規化によるモデルの画像と、テキスト入力を処理する [`CLIPTokenizer`] です。
+
+```py
+>>> text_queries = ["hat", "book", "sunglasses", "camera"]
+>>> inputs = processor(text=text_queries, images=im, return_tensors="pt")
+```
+
+入力をモデルに渡し、後処理し、結果を視覚化します。以前は画像プロセッサによって画像のサイズが変更されていたため、
+それらをモデルにフィードするには、[`~OwlViTImageProcessor.post_process_object_detection`] メソッドを使用して、予測された境界を確認する必要があります。
+ボックスは元の画像を基準とした正しい座標を持ちます。
+
+```py
+>>> import torch
+
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+...     target_sizes = torch.tensor([im.size[::-1]])
+...     results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)[0]
+
+>>> draw = ImageDraw.Draw(im)
+
+>>> scores = results["scores"].tolist()
+>>> labels = results["labels"].tolist()
+>>> boxes = results["boxes"].tolist()
+
+>>> for box, score, label in zip(boxes, scores, labels):
+...     xmin, ymin, xmax, ymax = box
+...     draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1)
+...     draw.text((xmin, ymin), f"{text_queries[label]}: {round(score,2)}", fill="white")
+
+>>> im
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_4.png" alt="Beach photo with detected objects"/>
+</div>
+
+## Batch processing
+
+複数の画像セットとテキスト クエリを渡して、複数の画像内の異なる (または同じ) オブジェクトを検索できます。
+宇宙飛行士の画像とビーチの画像を組み合わせてみましょう。
+バッチ処理の場合、テキスト クエリをネストされたリストとしてプロセッサに渡し、画像を PIL イメージのリストとして渡す必要があります。
+PyTorch テンソル、または NumPy 配列。
+
+```py
+>>> images = [image, im]
+>>> text_queries = [
+...     ["human face", "rocket", "nasa badge", "star-spangled banner"],
+...     ["hat", "book", "sunglasses", "camera"],
+... ]
+>>> inputs = processor(text=text_queries, images=images, return_tensors="pt")
+```
+
+以前は後処理のために単一の画像のサイズをテンソルとして渡していましたが、タプルを渡すこともできます。
+複数の画像のタプルのリスト。 2 つの例の予測を作成し、2 番目の例 (`image_idx = 1`) を視覚化しましょう。
+
+```py
+>>> with torch.no_grad():
+...     outputs = model(**inputs)
+...     target_sizes = [x.size[::-1] for x in images]
+...     results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)
+
+>>> image_idx = 1
+>>> draw = ImageDraw.Draw(images[image_idx])
+
+>>> scores = results[image_idx]["scores"].tolist()
+>>> labels = results[image_idx]["labels"].tolist()
+>>> boxes = results[image_idx]["boxes"].tolist()
+
+>>> for box, score, label in zip(boxes, scores, labels):
+...     xmin, ymin, xmax, ymax = box
+...     draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1)
+...     draw.text((xmin, ymin), f"{text_queries[image_idx][label]}: {round(score,2)}", fill="white")
+
+>>> images[image_idx]
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_4.png" alt="Beach photo with detected objects"/>
+</div>
+
+## Image-guided object detection
+
+テキストクエリによるゼロショットオブジェクト検出に加えて、OWL-ViTは画像ガイドによるオブジェクト検出を提供します。これはつまり
+画像クエリを使用して、ターゲット画像内の類似したオブジェクトを検索できます。
+テキスト クエリとは異なり、使用できるサンプル画像は 1 つだけです。
+
+対象画像としてソファに2匹の猫がいる画像と、1匹の猫の画像を撮影しましょう
+クエリとして:
+
+```py
+>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image_target = Image.open(requests.get(url, stream=True).raw)
+
+>>> query_url = "http://images.cocodataset.org/val2017/000000524280.jpg"
+>>> query_image = Image.open(requests.get(query_url, stream=True).raw)
+```
+
+画像を簡単に見てみましょう。
+
+```py
+>>> import matplotlib.pyplot as plt
+
+>>> fig, ax = plt.subplots(1, 2)
+>>> ax[0].imshow(image_target)
+>>> ax[1].imshow(query_image)
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_5.png" alt="Cats"/>
+</div>
+
+前処理ステップでは、テキスト クエリの代わりに `query_images` を使用する必要があります。
+
+```py
+>>> inputs = processor(images=image_target, query_images=query_image, return_tensors="pt")
+```
+
+予測の場合、入力をモデルに渡す代わりに、[`~OwlViTForObjectDetection.image_guided_detection`] に渡します。予測を描く
+ラベルがないことを除いては以前と同様です。
+
+```py
+>>> with torch.no_grad():
+...     outputs = model.image_guided_detection(**inputs)
+...     target_sizes = torch.tensor([image_target.size[::-1]])
+...     results = processor.post_process_image_guided_detection(outputs=outputs, target_sizes=target_sizes)[0]
+
+>>> draw = ImageDraw.Draw(image_target)
+
+>>> scores = results["scores"].tolist()
+>>> boxes = results["boxes"].tolist()
+
+>>> for box, score, label in zip(boxes, scores, labels):
+...     xmin, ymin, xmax, ymax = box
+...     draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=4)
+
+>>> image_target
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/zero-sh-obj-detection_6.png" alt="Cats with bounding boxes"/>
+</div>
+
+OWL-ViTによる推論をインタラクティブに試したい場合は、このデモをチェックしてください。
+
+<iframe
+	src="https://adirik-owl-vit.hf.space"
+	frameborder="0"
+	width="850"
+	height="450"
+></iframe>
\ No newline at end of file
diff --git a/docs/source/ja/training.md b/docs/source/ja/training.md
index 54b34274bf1cf7..4e5dbaa77aefad 100644
--- a/docs/source/ja/training.md
+++ b/docs/source/ja/training.md
@@ -49,7 +49,7 @@ rendered properly in your Markdown viewer.
 ```
 
 トークナイザがテキストを処理し、可変のシーケンス長を処理するためのパディングと切り捨て戦略を含める必要があることをご存知の通り、
-データセットを1つのステップで処理するには、🤗 Datasets の [`map`](https://huggingface.co/docs/datasets/process.html#map) メソッドを使用して、
+データセットを1つのステップで処理するには、🤗 Datasets の [`map`](https://huggingface.co/docs/datasets/process#map) メソッドを使用して、
 データセット全体に前処理関数を適用します：
 
 ```py
diff --git a/docs/source/ko/llm_tutorial.md b/docs/source/ko/llm_tutorial.md
index 05f27dff4f50b2..d5e0bd356edd2e 100644
--- a/docs/source/ko/llm_tutorial.md
+++ b/docs/source/ko/llm_tutorial.md
@@ -74,14 +74,13 @@ LLM과 자기회귀 생성을 함께 사용할 때 핵심적인 부분은 이 
 
 </Tip>
 
-<!-- TODO: update example to llama 2 (or a newer popular baseline) when it becomes ungated -->
 먼저, 모델을 불러오세요.
 
-```py
+```python
 >>> from transformers import AutoModelForCausalLM
 
 >>> model = AutoModelForCausalLM.from_pretrained(
-...     "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
+...     "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
 ... )
 ```
 
@@ -94,18 +93,20 @@ LLM과 자기회귀 생성을 함께 사용할 때 핵심적인 부분은 이 
 
 이어서 텍스트 입력을 [토크나이저](tokenizer_summary)으로 전처리하세요.
 
-```py
+```python
 >>> from transformers import AutoTokenizer
+>>> import torch
 
->>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
->>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to(device)
 ```
 
 `model_inputs` 변수에는 토큰화된 텍스트 입력과 함께 어텐션 마스크가 들어 있습니다. [`~generation.GenerationMixin.generate`]는 어텐션 마스크가 제공되지 않았을 경우에도 이를 추론하려고 노력하지만, 최상의 성능을 위해서는 가능하면 어텐션 마스크를 전달하는 것을 권장합니다. 
 
 마지막으로 [`~generation.GenerationMixin.generate`] 메소드를 호출해 생성된 토큰을 얻은 후, 이를 출력하기 전에 텍스트 형태로 변환하세요.
 
-```py
+```python
 >>> generated_ids = model.generate(**model_inputs)
 >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 'A list of colors: red, blue, green, yellow, black, white, and brown'
@@ -121,10 +122,10 @@ LLM과 자기회귀 생성을 함께 사용할 때 핵심적인 부분은 이 
 ```py
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 
->>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
->>> tokenizer.pad_token = tokenizer.eos_token  # Llama has no pad token by default
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+>>> tokenizer.pad_token = tokenizer.eos_token  # Mistral has no pad token by default
 >>> model = AutoModelForCausalLM.from_pretrained(
-...     "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
+...     "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
 ... )
 ```
 
@@ -137,12 +138,12 @@ LLM과 자기회귀 생성을 함께 사용할 때 핵심적인 부분은 이 
 >>> model_inputs = tokenizer(["A sequence of numbers: 1, 2"], return_tensors="pt").to("cuda")
 
 >>> # By default, the output will contain up to 20 tokens
->>> generated_ids = model.generate(**model_inputs)
+>>> generated_ids = model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id)
 >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 'A sequence of numbers: 1, 2, 3, 4, 5'
 
 >>> # Setting `max_new_tokens` allows you to control the maximum length
->>> generated_ids = model.generate(**model_inputs, max_new_tokens=50)
+>>> generated_ids = model.generate(**model_inputs, pad_token_id=tokenizer.eos_token_id, max_new_tokens=50)
 >>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
 'A sequence of numbers: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,'
 ```
@@ -151,7 +152,7 @@ LLM과 자기회귀 생성을 함께 사용할 때 핵심적인 부분은 이 
 
 기본적으로 [`~generation.GenerationConfig`] 파일에서 별도로 지정하지 않으면, `generate`는 각 반복에서 가장 확률이 높은 토큰을 선택합니다(그리디 디코딩). 하려는 작업에 따라 이 방법은 바람직하지 않을 수 있습니다. 예를 들어, 챗봇이나 에세이 작성과 같은 창의적인 작업은 샘플링이 적합할 수 있습니다. 반면, 오디오를 텍스트로 변환하거나 번역과 같은 입력 기반 작업은 그리디 디코딩이 더 적합할 수 있습니다. `do_sample=True`로 샘플링을 활성화할 수 있으며, 이 주제에 대한 자세한 내용은 이 [블로그 포스트](https://huggingface.co/blog/how-to-generate)에서 볼 수 있습니다.
 
-```py
+```python
 >>> # Set seed or reproducibility -- you don't need this unless you want full reproducibility
 >>> from transformers import set_seed
 >>> set_seed(0)
@@ -173,7 +174,7 @@ LLM과 자기회귀 생성을 함께 사용할 때 핵심적인 부분은 이 
 
 LLM은 [디코더 전용](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt) 구조를 가지고 있어, 입력 프롬프트에 대해 지속적으로 반복 처리를 합니다. 입력 데이터의 길이가 다르면 패딩 작업이 필요합니다. LLM은 패딩 토큰에서 작동을 이어가도록 설계되지 않았기 때문에, 입력 왼쪽에 패딩이 추가 되어야 합니다. 그리고 어텐션 마스크도 꼭 `generate` 함수에 전달되어야 합니다!
 
-```py
+```python
 >>> # The tokenizer initialized above has right-padding active by default: the 1st sequence,
 >>> # which is shorter, has padding on the right side. Generation fails.
 >>> model_inputs = tokenizer(
diff --git a/docs/source/ko/model_doc/whisper.md b/docs/source/ko/model_doc/whisper.md
index 68fbe045caf62d..f48bae1e60f5d0 100644
--- a/docs/source/ko/model_doc/whisper.md
+++ b/docs/source/ko/model_doc/whisper.md
@@ -33,6 +33,14 @@ Whisper 모델은 Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine
 - 현재 추론은 짧은 형식에만 구현되어 있으며, 오디오는 30초 미만의 세그먼트로 미리 분할되어야 합니다. 타임스탬프를 포함한 긴 형식에 대한 추론은 향후 릴리스에서 구현될 예정입니다.
 - [`WhisperProcessor`]를 사용하여 모델에 사용할 오디오를 준비하고, 예측된 ID를 텍스트로 디코딩할 수 있습니다.
 
+- 모델과 프로세서를 변환하려면 다음을 사용하는 것이 좋습니다:
+
+```bash
+python src/transformers/models/whisper/convert_openai_to_hf.py --checkpoint_path "" --pytorch_dump_folder_path "Arthur/whisper-3" --convert_preprocessor True
+```
+스크립트는 OpenAI 체크포인트에서 필요한 모든 매개변수를 자동으로 결정합니다. OpenAI 변환을 수행하려면 `tiktoken` 라이브러리를 설치해야 합니다.
+라이브러리를 설치해야 OpenAI 토큰화기를 `tokenizers` 버전으로 변환할 수 있습니다.
+
 이 모델은 [Arthur Zucker](https://huggingface.co/ArthurZ)에 의해 제공되었습니다. 이 모델의 Tensorflow 버전은 [amyeroberts](https://huggingface.co/amyeroberts)에 의해 제공되었습니다.
 원본 코드는 [여기](https://github.com/openai/whisper)에서 찾을 수 있습니다.
 
diff --git a/docs/source/ko/perf_hardware.md b/docs/source/ko/perf_hardware.md
index e715b39487f37d..bb35e6fae2f282 100644
--- a/docs/source/ko/perf_hardware.md
+++ b/docs/source/ko/perf_hardware.md
@@ -135,7 +135,7 @@ NVLink 사용 시 훈련이 약 23% 더 빠르게 완료됨을 확인할 수 있
 ```bash
 # DDP w/ NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
@@ -144,7 +144,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch
 
 # DDP w/o NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
diff --git a/docs/source/ko/perf_train_gpu_many.md b/docs/source/ko/perf_train_gpu_many.md
index 9d80fd65727df3..706832a8a1dc89 100644
--- a/docs/source/ko/perf_train_gpu_many.md
+++ b/docs/source/ko/perf_train_gpu_many.md
@@ -145,7 +145,7 @@ python examples/pytorch/language-modeling/run_clm.py \
 
 # DDP w/ NVlink
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
-python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
@@ -153,7 +153,7 @@ python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-
 
 # DDP w/o NVlink
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
-python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
+torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
 
diff --git a/docs/source/ko/preprocessing.md b/docs/source/ko/preprocessing.md
index 7a9d2987381cd9..e11f68d6566690 100644
--- a/docs/source/ko/preprocessing.md
+++ b/docs/source/ko/preprocessing.md
@@ -220,7 +220,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 오디오 작업은 모델에 맞는 데이터 세트를 준비하기 위해 [특성 추출기](main_classes/feature_extractor)가 필요합니다. 특성 추출기는 원시 오디오 데이터에서 특성를 추출하고 이를 텐서로 변환하는 것이 목적입니다.
 
-오디오 데이터 세트에 특성 추출기를 사용하는 방법을 보기 위해 [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) 데이터 세트를 가져오세요. (데이터 세트를 가져오는 방법은 🤗 [데이터 세트 튜토리얼](https://huggingface.co/docs/datasets/load_hub.html)에서 자세히 설명하고 있습니다.)
+오디오 데이터 세트에 특성 추출기를 사용하는 방법을 보기 위해 [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) 데이터 세트를 가져오세요. (데이터 세트를 가져오는 방법은 🤗 [데이터 세트 튜토리얼](https://huggingface.co/docs/datasets/load_hub)에서 자세히 설명하고 있습니다.)
 
 ```py
 >>> from datasets import load_dataset, Audio
@@ -346,7 +346,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 </Tip>
 
 [food101](https://huggingface.co/datasets/food101) 데이터 세트를 가져와서 컴퓨터 비전 데이터 세트에서 이미지 프로세서를 어떻게 사용하는지 알아보세요.
-데이터 세트를 불러오는 방법은 🤗 [데이터 세트 튜토리얼](https://huggingface.co/docs/datasets/load_hub.html)을 참고하세요.
+데이터 세트를 불러오는 방법은 🤗 [데이터 세트 튜토리얼](https://huggingface.co/docs/datasets/load_hub)을 참고하세요.
 
 <Tip>
 
@@ -360,7 +360,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 >>> dataset = load_dataset("food101", split="train[:100]")
 ```
 
-다음으로, 🤗 Datasets의 [`image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image)로 이미지를 확인해보세요:
+다음으로, 🤗 Datasets의 [`image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image)로 이미지를 확인해보세요:
 
 ```py
 >>> dataset[0]["image"]
@@ -418,7 +418,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 </Tip>
 
-3. 🤗 Datasets의 [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)를 사용하여 실시간으로 변환을 적용합니다:
+3. 🤗 Datasets의 [`set_transform`](https://huggingface.co/docs/datasets/process#format-transform)를 사용하여 실시간으로 변환을 적용합니다:
 
 ```py
 >>> dataset.set_transform(transforms)
@@ -476,7 +476,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 프로세서는 토크나이저와 특성 추출기와 같은 두 가지 처리 객체를 결합합니다.
 
 [LJ Speech](https://huggingface.co/datasets/lj_speech) 데이터 세트를 가져와서 자동 음성 인식(ASR)을 위한 프로세서를 사용하는 방법을 확인하세요.
-(데이터 세트를 가져오는 방법에 대한 자세한 내용은 🤗 [데이터 세트 튜토리얼](https://huggingface.co/docs/datasets/load_hub.html)에서 볼 수 있습니다.)
+(데이터 세트를 가져오는 방법에 대한 자세한 내용은 🤗 [데이터 세트 튜토리얼](https://huggingface.co/docs/datasets/load_hub)에서 볼 수 있습니다.)
 
 ```py
 >>> from datasets import load_dataset
diff --git a/docs/source/ko/run_scripts.md b/docs/source/ko/run_scripts.md
index c1af1677183bbb..f88e8e8252f970 100644
--- a/docs/source/ko/run_scripts.md
+++ b/docs/source/ko/run_scripts.md
@@ -141,7 +141,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 - `nproc_per_node` 인수를 추가해 사용할 GPU 개수를 설정합니다.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/ko/tasks/language_modeling.md b/docs/source/ko/tasks/language_modeling.md
index ba540825c29521..bf10660c61c188 100644
--- a/docs/source/ko/tasks/language_modeling.md
+++ b/docs/source/ko/tasks/language_modeling.md
@@ -107,7 +107,7 @@ pip install transformers datasets evaluate
 >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
 ```
 
-위의 예제에서 알 수 있듯이, `text` 필드는 `answers` 아래에 중첩되어 있습니다. 따라서 [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) 메소드를 사용하여 중첩 구조에서 `text` 하위 필드를 추출해야 합니다.
+위의 예제에서 알 수 있듯이, `text` 필드는 `answers` 아래에 중첩되어 있습니다. 따라서 [`flatten`](https://huggingface.co/docs/datasets/process#flatten) 메소드를 사용하여 중첩 구조에서 `text` 하위 필드를 추출해야 합니다.
 
 ```py
 >>> eli5 = eli5.flatten()
diff --git a/docs/source/ko/tasks/masked_language_modeling.md b/docs/source/ko/tasks/masked_language_modeling.md
index d22d439dbd514b..ee835d13ebc0b4 100644
--- a/docs/source/ko/tasks/masked_language_modeling.md
+++ b/docs/source/ko/tasks/masked_language_modeling.md
@@ -107,7 +107,7 @@ Hugging Face 계정에 로그인하여 모델을 업로드하고 커뮤니티와
 ```
 
 위의 예제에서와 마찬가지로, `text` 필드는 `answers` 안에 중첩되어 있습니다. 
-따라서 중첩된 구조에서 [`flatten`](https://huggingface.co/docs/datasets/process.html#flatten) 메소드를 사용하여 `text` 하위 필드를 추출합니다:
+따라서 중첩된 구조에서 [`flatten`](https://huggingface.co/docs/datasets/process#flatten) 메소드를 사용하여 `text` 하위 필드를 추출합니다:
 
 ```py
 >>> eli5 = eli5.flatten()
diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md
index ca384d038162c1..0076bba6f8441f 100644
--- a/docs/source/ko/tasks/object_detection.md
+++ b/docs/source/ko/tasks/object_detection.md
@@ -504,7 +504,7 @@ COCO 데이터 세트를 빌드하는 API는 데이터를 특정 형식으로 
 ...         outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask)
 
 ...         orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0)
-...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to COCO api
+...         results = im_processor.post_process(outputs, orig_target_sizes)  # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax)
 
 ...         module.add(prediction=results, reference=labels)
 ...         del batch
diff --git a/docs/source/ko/training.md b/docs/source/ko/training.md
index 4e375f0f721542..f4ab1332294363 100644
--- a/docs/source/ko/training.md
+++ b/docs/source/ko/training.md
@@ -43,7 +43,7 @@ rendered properly in your Markdown viewer.
  'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
 ```
 
-텍스트를 처리하고 서로 다른 길이의 시퀀스 패딩 및 잘라내기 전략을 포함하려면 토크나이저가 필요합니다. 데이터셋을 한 번에 처리하려면 🤗 Dataset [`map`](https://huggingface.co/docs/datasets/process.html#map) 메서드를 사용하여 전체 데이터셋에 전처리 함수를 적용하세요:
+텍스트를 처리하고 서로 다른 길이의 시퀀스 패딩 및 잘라내기 전략을 포함하려면 토크나이저가 필요합니다. 데이터셋을 한 번에 처리하려면 🤗 Dataset [`map`](https://huggingface.co/docs/datasets/process#map) 메서드를 사용하여 전체 데이터셋에 전처리 함수를 적용하세요:
 
 ```py
 >>> from transformers import AutoTokenizer
diff --git a/docs/source/ms/index.md b/docs/source/ms/index.md
index e57b65fc40c6b9..28ec0aec7540fa 100644
--- a/docs/source/ms/index.md
+++ b/docs/source/ms/index.md
@@ -228,6 +228,7 @@ Dokumentasi disusun kepada lima bahagian:
 1. **[Transformer-XL](model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[TrOCR](model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
 1. **[TVLT](model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[TVP](model_doc/tvp)** (from Intel) released with the paper [Text-Visual Prompting for Efficient 2D Temporal Video Grounding](https://arxiv.org/abs/2303.04995) by Yimeng Zhang, Xin Chen, Jinghan Jia, Sijia Liu, Ke Ding.
 1. **[UL2](model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
 1. **[UniSpeech](model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
 1. **[UniSpeechSat](model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
@@ -431,6 +432,7 @@ Flax), PyTorch, dan/atau TensorFlow.
 |        Transformer-XL         |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 |             TrOCR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |             TVLT              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|              TVP              |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |           UniSpeech           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |         UniSpeechSat          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 |            UPerNet            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
diff --git a/docs/source/pt/converting_tensorflow_models.md b/docs/source/pt/converting_tensorflow_models.md
index ac1271d2764be4..97767b2ad420db 100644
--- a/docs/source/pt/converting_tensorflow_models.md
+++ b/docs/source/pt/converting_tensorflow_models.md
@@ -109,20 +109,6 @@ transformers-cli convert --model_type gpt2 \
   [--finetuning_task_name OPENAI_GPT2_FINETUNED_TASK]
 ```
 
-## Transformer-XL
-
-Aqui está um exemplo do processo de conversão para um modelo Transformer-XL pré-treinado (consulte [aqui](https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-modelos-sota))
-
-```bash
-export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
-
-transformers-cli convert --model_type transfo_xl \
-  --tf_checkpoint $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
-  --pytorch_dump_output $PYTORCH_DUMP_OUTPUT \
-  [--config TRANSFO_XL_CONFIG] \
-  [--finetuning_task_name TRANSFO_XL_FINETUNED_TASK]
-```
-
 ## XLNet
 
 Aqui está um exemplo do processo de conversão para um modelo XLNet pré-treinado:
diff --git a/docs/source/pt/quicktour.md b/docs/source/pt/quicktour.md
index fd89b2485599da..9ecb760e6969b6 100644
--- a/docs/source/pt/quicktour.md
+++ b/docs/source/pt/quicktour.md
@@ -119,7 +119,7 @@ Crie uma [`pipeline`] com a tarefa que deseja resolver e o modelo que deseja usa
 >>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
 ```
 
-A seguir, carregue uma base de dados (confira a 🤗 [Iniciação em Datasets](https://huggingface.co/docs/datasets/quickstart.html) para mais detalhes) que você gostaria de iterar sobre. Por exemplo, vamos carregar o dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14):
+A seguir, carregue uma base de dados (confira a 🤗 [Iniciação em Datasets](https://huggingface.co/docs/datasets/quickstart) para mais detalhes) que você gostaria de iterar sobre. Por exemplo, vamos carregar o dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14):
 
 ```py
 >>> from datasets import load_dataset, Audio
diff --git a/docs/source/pt/run_scripts.md b/docs/source/pt/run_scripts.md
index 8d87c10c271334..ff3110817e8ae7 100644
--- a/docs/source/pt/run_scripts.md
+++ b/docs/source/pt/run_scripts.md
@@ -131,7 +131,7 @@ O [Trainer](https://huggingface.co/docs/transformers/main_classes/trainer) ofere
 - Defina o número de GPUs a serem usadas com o argumento `nproc_per_node`.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/pt/tasks/sequence_classification.md b/docs/source/pt/tasks/sequence_classification.md
index 6469ac4d45534c..02647f68f8866f 100644
--- a/docs/source/pt/tasks/sequence_classification.md
+++ b/docs/source/pt/tasks/sequence_classification.md
@@ -148,7 +148,7 @@ O [`Trainer`] aplicará o preenchimento dinâmico por padrão quando você defin
 </Tip>
 </pt>
 <tf>
-Para executar o fine-tuning de um modelo no TensorFlow, comece convertendo seu conjunto de dados para o formato `tf.data.Dataset` com [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Nessa execução você deverá especificar as entradas e rótulos (no parâmetro `columns`), se deseja embaralhar o conjunto de dados, o tamanho do batch e o data collator:
+Para executar o fine-tuning de um modelo no TensorFlow, comece convertendo seu conjunto de dados para o formato `tf.data.Dataset` com [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.to_tf_dataset). Nessa execução você deverá especificar as entradas e rótulos (no parâmetro `columns`), se deseja embaralhar o conjunto de dados, o tamanho do batch e o data collator:
 
 ```py
 >>> tf_train_set = tokenized_imdb["train"].to_tf_dataset(
diff --git a/docs/source/pt/tasks/token_classification.md b/docs/source/pt/tasks/token_classification.md
index ba8298e9f581d9..316d6a8102180a 100644
--- a/docs/source/pt/tasks/token_classification.md
+++ b/docs/source/pt/tasks/token_classification.md
@@ -201,7 +201,7 @@ Nesse ponto, restam apenas três passos:
 ```
 </pt>
 <tf>
-Para executar o fine-tuning de um modelo no TensorFlow, comece convertendo seu conjunto de dados para o formato `tf.data.Dataset` com [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset). Nessa execução você deverá especificar as entradas e rótulos (no parâmetro `columns`), se deseja embaralhar o conjunto de dados, o tamanho do batch e o data collator:
+Para executar o fine-tuning de um modelo no TensorFlow, comece convertendo seu conjunto de dados para o formato `tf.data.Dataset` com [`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.to_tf_dataset). Nessa execução você deverá especificar as entradas e rótulos (no parâmetro `columns`), se deseja embaralhar o conjunto de dados, o tamanho do batch e o data collator:
 
 ```py
 >>> tf_train_set = tokenized_wnut["train"].to_tf_dataset(
diff --git a/docs/source/pt/training.md b/docs/source/pt/training.md
index aa529ac948b82d..6e39a46b16432d 100644
--- a/docs/source/pt/training.md
+++ b/docs/source/pt/training.md
@@ -52,7 +52,7 @@ Comece carregando o dataset [Yelp Reviews](https://huggingface.co/datasets/yelp_
 
 Como já sabe, é necessário ter um tokenizador para processar o texto e incluir uma estratégia de padding e truncamento,
 para manejar qualquer tamanho varíavel de sequência. Para processar o seu dataset em apenas um passo, utilize o método de
-🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process.html#map) para aplicar uma função de preprocessamento sobre
+🤗 Datasets [`map`](https://huggingface.co/docs/datasets/process#map) para aplicar uma função de preprocessamento sobre
 todo o dataset.
 
 ```py
@@ -126,7 +126,7 @@ Especifique onde salvar os checkpoints do treinamento:
 O [`Trainer`] não avalia automaticamente o rendimento do modelo durante o treinamento. Será necessário passar ao
 [`Trainer`] uma função para calcular e fazer um diagnóstico sobre as métricas. A biblioteca 🤗 Datasets proporciona
 uma função de [`accuracy`](https://huggingface.co/metrics/accuracy) simples que pode ser carregada com a função
-`load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics.html) para mais informações):
+`load_metric` (ver este [tutorial](https://huggingface.co/docs/datasets/metrics) para mais informações):
 
 ```py
 >>> import numpy as np
@@ -203,7 +203,7 @@ Assegure-se de especificar os `return_tensors` para retornar os tensores do Tens
 </Tip>
 
 Em seguida, converta os datasets tokenizados em datasets do TensorFlow com o método
-[`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.to_tf_dataset).
+[`to_tf_dataset`](https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.to_tf_dataset).
 Especifique suas entradas em `columns` e seu rótulo em `label_cols`:
 
 ```py
@@ -385,7 +385,7 @@ uma barra de progresso sobre o número de passos percorridos no treinamento atua
 
 Da mesma forma que é necessário adicionar uma função de avaliação ao [`Trainer`], é necessário fazer o mesmo quando
 escrevendo o próprio ciclo de treinamento. Contudo, em vez de calcular e retornar a métrica final de cada época,
-você deverá adicionar todos os batches com [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=add_batch#datasets.Metric.add_batch)
+você deverá adicionar todos os batches com [`add_batch`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=add_batch#datasets.Metric.add_batch)
 e calcular a métrica apenas no final.
 
 ```py
diff --git a/docs/source/zh/_toctree.yml b/docs/source/zh/_toctree.yml
index 914ce68fd26dee..7cf2f1dc55a9f3 100644
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@@ -68,7 +68,66 @@
   title: 概念指南
 - sections:
   - sections:
+    - local: main_classes/agent
+      title: Agents和工具
+    - local: main_classes/callback
+      title: Callbacks
+    - local: main_classes/configuration
+      title: Configuration
+    - local: main_classes/data_collator
+      title: Data Collator
+    - local: main_classes/keras_callbacks
+      title: Keras callbacks
+    - local: main_classes/logging
+      title: Logging
     - local: main_classes/model
       title: 模型
+    - local: main_classes/text_generation
+      title: 文本生成
+    - local: main_classes/onnx
+      title: ONNX
+    - local: main_classes/optimizer_schedules
+      title: Optimization
+    - local: main_classes/output
+      title: 模型输出
+    - local: main_classes/pipelines
+      title: Pipelines
+    - local: main_classes/processors
+      title: Processors
+    - local: main_classes/quantization
+      title: Quantization
+    - local: main_classes/tokenizer
+      title: Tokenizer
+    - local: main_classes/trainer
+      title: Trainer
+    - local: main_classes/deepspeed
+      title: DeepSpeed集成
+    - local: main_classes/feature_extractor
+      title: Feature Extractor
+    - local: main_classes/image_processor
+      title: Image Processor
     title: 主要类
-  title: 应用程序接口 (API) 
\ No newline at end of file
+  - sections:
+    - local: internal/modeling_utils
+      title: 自定义层和工具
+    - local: internal/pipelines_utils
+      title: pipelines工具
+    - local: internal/tokenization_utils
+      title: Tokenizers工具
+    - local: internal/trainer_utils
+      title: 训练器工具
+    - local: internal/generation_utils
+      title: 生成工具
+    - local: internal/image_processing_utils
+      title: 图像处理工具
+    - local: internal/audio_utils
+      title: 音频处理工具
+    - local: internal/file_utils
+      title: 通用工具
+    - local: internal/time_series_utils
+      title: 时序数据工具
+    title: 内部辅助工具
+  title: 应用程序接口 (API) 
+
+
+
diff --git a/docs/source/zh/internal/audio_utils.md b/docs/source/zh/internal/audio_utils.md
new file mode 100644
index 00000000000000..17fc430f984287
--- /dev/null
+++ b/docs/source/zh/internal/audio_utils.md
@@ -0,0 +1,40 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+#  `FeatureExtractors`的工具
+
+此页面列出了音频 [`FeatureExtractor`] 可以使用的所有实用函数，以便使用常见的算法（如 *Short Time Fourier Transform* 或 *log mel spectrogram*）从原始音频中计算特殊特征。
+
+其中大多数仅在您研究库中音频processors的代码时有用。
+
+
+## 音频转换
+
+[[autodoc]] audio_utils.hertz_to_mel
+
+[[autodoc]] audio_utils.mel_to_hertz
+
+[[autodoc]] audio_utils.mel_filter_bank
+
+[[autodoc]] audio_utils.optimal_fft_length
+
+[[autodoc]] audio_utils.window_function
+
+[[autodoc]] audio_utils.spectrogram
+
+[[autodoc]] audio_utils.power_to_db
+
+[[autodoc]] audio_utils.amplitude_to_db
diff --git a/docs/source/zh/internal/file_utils.md b/docs/source/zh/internal/file_utils.md
new file mode 100644
index 00000000000000..ba4b4902814a65
--- /dev/null
+++ b/docs/source/zh/internal/file_utils.md
@@ -0,0 +1,50 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 通用工具
+
+此页面列出了在`utils.py`文件中找到的所有Transformers通用实用函数。
+
+其中大多数仅在您研究库中的通用代码时才有用。
+
+
+## Enums和namedtuples(命名元组)
+
+[[autodoc]] utils.ExplicitEnum
+
+[[autodoc]] utils.PaddingStrategy
+
+[[autodoc]] utils.TensorType
+
+## 特殊的装饰函数
+
+[[autodoc]] utils.add_start_docstrings
+
+[[autodoc]] utils.add_start_docstrings_to_model_forward
+
+[[autodoc]] utils.add_end_docstrings
+
+[[autodoc]] utils.add_code_sample_docstrings
+
+[[autodoc]] utils.replace_return_docstrings
+
+## 特殊的属性
+
+[[autodoc]] utils.cached_property
+
+## 其他实用程序
+
+[[autodoc]] utils._LazyModule
diff --git a/docs/source/zh/internal/generation_utils.md b/docs/source/zh/internal/generation_utils.md
new file mode 100644
index 00000000000000..d8013ac87dcb21
--- /dev/null
+++ b/docs/source/zh/internal/generation_utils.md
@@ -0,0 +1,364 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 用于生成的工具
+
+此页面列出了所有由 [`~generation.GenerationMixin.generate`],
+[`~generation.GenerationMixin.greedy_search`],
+[`~generation.GenerationMixin.contrastive_search`],
+[`~generation.GenerationMixin.sample`],
+[`~generation.GenerationMixin.beam_search`],
+[`~generation.GenerationMixin.beam_sample`],
+[`~generation.GenerationMixin.group_beam_search`], 和
+[`~generation.GenerationMixin.constrained_beam_search`]使用的实用函数。
+
+其中大多数仅在您研究库中生成方法的代码时才有用。
+
+## 生成输出
+
+[`~generation.GenerationMixin.generate`] 的输出是 [`~utils.ModelOutput`] 的一个子类的实例。这个输出是一种包含 [`~generation.GenerationMixin.generate`] 返回的所有信息数据结构，但也可以作为元组或字典使用。
+这里是一个例子：
+
+
+```python
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+model = GPT2LMHeadModel.from_pretrained("gpt2")
+
+inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
+generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+```
+
+`generation_output` 的对象是 [`~generation.GreedySearchDecoderOnlyOutput`] 的一个实例，从该类的文档中我们可以看到，这意味着它具有以下属性：
+
+- `sequences`: 生成的tokens序列
+- `scores`（可选）: 每个生成步骤的语言建模头的预测分数
+- `hidden_states`（可选）: 每个生成步骤模型的hidden states
+- `attentions`（可选）: 每个生成步骤模型的注意力权重
+
+在这里，由于我们传递了 `output_scores=True`，我们具有 `scores` 属性。但我们没有 `hidden_states` 和 `attentions`，因为没有传递 `output_hidden_states=True` 或 `output_attentions=True`。
+
+您可以像通常一样访问每个属性，如果该属性未被模型返回，则将获得 `None`。例如，在这里 `generation_output.scores` 是语言建模头的所有生成预测分数，而 `generation_output.attentions` 为 `None`。
+
+当我们将 `generation_output` 对象用作元组时，它只保留非 `None` 值的属性。例如，在这里它有两个元素，`loss` 然后是 `logits`，所以
+
+
+```python
+generation_output[:2]
+```
+
+将返回元组`(generation_output.sequences, generation_output.scores)`。
+
+当我们将`generation_output`对象用作字典时，它只保留非`None`的属性。例如，它有两个键，分别是`sequences`和`scores`。
+
+我们在此记录所有输出类型。
+
+
+### PyTorch
+
+[[autodoc]] generation.GreedySearchEncoderDecoderOutput
+
+[[autodoc]] generation.GreedySearchDecoderOnlyOutput
+
+[[autodoc]] generation.SampleEncoderDecoderOutput
+
+[[autodoc]] generation.SampleDecoderOnlyOutput
+
+[[autodoc]] generation.BeamSearchEncoderDecoderOutput
+
+[[autodoc]] generation.BeamSearchDecoderOnlyOutput
+
+[[autodoc]] generation.BeamSampleEncoderDecoderOutput
+
+[[autodoc]] generation.BeamSampleDecoderOnlyOutput
+
+[[autodoc]] generation.ContrastiveSearchEncoderDecoderOutput
+
+[[autodoc]] generation.ContrastiveSearchDecoderOnlyOutput
+
+### TensorFlow
+
+[[autodoc]] generation.TFGreedySearchEncoderDecoderOutput
+
+[[autodoc]] generation.TFGreedySearchDecoderOnlyOutput
+
+[[autodoc]] generation.TFSampleEncoderDecoderOutput
+
+[[autodoc]] generation.TFSampleDecoderOnlyOutput
+
+[[autodoc]] generation.TFBeamSearchEncoderDecoderOutput
+
+[[autodoc]] generation.TFBeamSearchDecoderOnlyOutput
+
+[[autodoc]] generation.TFBeamSampleEncoderDecoderOutput
+
+[[autodoc]] generation.TFBeamSampleDecoderOnlyOutput
+
+[[autodoc]] generation.TFContrastiveSearchEncoderDecoderOutput
+
+[[autodoc]] generation.TFContrastiveSearchDecoderOnlyOutput
+
+### FLAX
+
+[[autodoc]] generation.FlaxSampleOutput
+
+[[autodoc]] generation.FlaxGreedySearchOutput
+
+[[autodoc]] generation.FlaxBeamSearchOutput
+
+## LogitsProcessor
+
+[`LogitsProcessor`] 可以用于修改语言模型头的预测分数以进行生成
+
+
+### PyTorch
+
+[[autodoc]] AlternatingCodebooksLogitsProcessor
+    - __call__
+
+[[autodoc]] ClassifierFreeGuidanceLogitsProcessor
+    - __call__
+
+[[autodoc]] EncoderNoRepeatNGramLogitsProcessor
+    - __call__
+
+[[autodoc]] EncoderRepetitionPenaltyLogitsProcessor
+    - __call__
+
+[[autodoc]] EpsilonLogitsWarper
+    - __call__
+
+[[autodoc]] EtaLogitsWarper
+    - __call__
+
+[[autodoc]] ExponentialDecayLengthPenalty
+    - __call__
+
+[[autodoc]] ForcedBOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] ForcedEOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] ForceTokensLogitsProcessor
+    - __call__
+
+[[autodoc]] HammingDiversityLogitsProcessor
+    - __call__
+
+[[autodoc]] InfNanRemoveLogitsProcessor
+    - __call__
+
+[[autodoc]] LogitNormalization
+    - __call__
+
+[[autodoc]] LogitsProcessor
+    - __call__
+
+[[autodoc]] LogitsProcessorList
+    - __call__
+
+[[autodoc]] LogitsWarper
+    - __call__
+
+[[autodoc]] MinLengthLogitsProcessor
+    - __call__
+
+[[autodoc]] MinNewTokensLengthLogitsProcessor
+    - __call__
+
+[[autodoc]] NoBadWordsLogitsProcessor
+    - __call__
+
+[[autodoc]] NoRepeatNGramLogitsProcessor
+    - __call__
+
+[[autodoc]] PrefixConstrainedLogitsProcessor
+    - __call__
+
+[[autodoc]] RepetitionPenaltyLogitsProcessor
+    - __call__
+
+[[autodoc]] SequenceBiasLogitsProcessor
+    - __call__
+
+[[autodoc]] SuppressTokensAtBeginLogitsProcessor
+    - __call__
+
+[[autodoc]] SuppressTokensLogitsProcessor
+    - __call__
+
+[[autodoc]] TemperatureLogitsWarper
+    - __call__
+
+[[autodoc]] TopKLogitsWarper
+    - __call__
+
+[[autodoc]] TopPLogitsWarper
+    - __call__
+
+[[autodoc]] TypicalLogitsWarper
+    - __call__
+
+[[autodoc]] UnbatchedClassifierFreeGuidanceLogitsProcessor
+    - __call__
+
+[[autodoc]] WhisperTimeStampLogitsProcessor
+    - __call__
+
+### TensorFlow
+
+[[autodoc]] TFForcedBOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] TFForcedEOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] TFForceTokensLogitsProcessor
+    - __call__
+
+[[autodoc]] TFLogitsProcessor
+    - __call__
+
+[[autodoc]] TFLogitsProcessorList
+    - __call__
+
+[[autodoc]] TFLogitsWarper
+    - __call__
+
+[[autodoc]] TFMinLengthLogitsProcessor
+    - __call__
+
+[[autodoc]] TFNoBadWordsLogitsProcessor
+    - __call__
+
+[[autodoc]] TFNoRepeatNGramLogitsProcessor
+    - __call__
+
+[[autodoc]] TFRepetitionPenaltyLogitsProcessor
+    - __call__
+
+[[autodoc]] TFSuppressTokensAtBeginLogitsProcessor
+    - __call__
+
+[[autodoc]] TFSuppressTokensLogitsProcessor
+    - __call__
+
+[[autodoc]] TFTemperatureLogitsWarper
+    - __call__
+
+[[autodoc]] TFTopKLogitsWarper
+    - __call__
+
+[[autodoc]] TFTopPLogitsWarper
+    - __call__
+
+### FLAX
+
+[[autodoc]] FlaxForcedBOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxForcedEOSTokenLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxForceTokensLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxLogitsProcessorList
+    - __call__
+
+[[autodoc]] FlaxLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxMinLengthLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxSuppressTokensAtBeginLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxSuppressTokensLogitsProcessor
+    - __call__
+
+[[autodoc]] FlaxTemperatureLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxTopKLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxTopPLogitsWarper
+    - __call__
+
+[[autodoc]] FlaxWhisperTimeStampLogitsProcessor
+    - __call__
+
+## StoppingCriteria
+
+可以使用[`StoppingCriteria`]来更改停止生成的时间（除了EOS token以外的方法）。请注意，这仅适用于我们的PyTorch实现。
+
+
+[[autodoc]] StoppingCriteria
+    - __call__
+
+[[autodoc]] StoppingCriteriaList
+    - __call__
+
+[[autodoc]] MaxLengthCriteria
+    - __call__
+
+[[autodoc]] MaxTimeCriteria
+    - __call__
+
+## Constraints
+
+可以使用[`Constraint`]来强制生成结果包含输出中的特定tokens或序列。请注意，这仅适用于我们的PyTorch实现。
+
+[[autodoc]] Constraint
+
+[[autodoc]] PhrasalConstraint
+
+[[autodoc]] DisjunctiveConstraint
+
+[[autodoc]] ConstraintListState
+
+## BeamSearch
+
+[[autodoc]] BeamScorer
+    - process
+    - finalize
+
+[[autodoc]] BeamSearchScorer
+    - process
+    - finalize
+
+[[autodoc]] ConstrainedBeamSearchScorer
+    - process
+    - finalize
+
+## Utilities
+
+[[autodoc]] top_k_top_p_filtering
+
+[[autodoc]] tf_top_k_top_p_filtering
+
+## Streamers
+
+[[autodoc]] TextStreamer
+
+[[autodoc]] TextIteratorStreamer
diff --git a/docs/source/zh/internal/image_processing_utils.md b/docs/source/zh/internal/image_processing_utils.md
new file mode 100644
index 00000000000000..b3c784fa345237
--- /dev/null
+++ b/docs/source/zh/internal/image_processing_utils.md
@@ -0,0 +1,48 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Image Processors的工具
+
+此页面列出了image processors使用的所有实用函数功能，主要是用于处理图像的功能变换。
+
+其中大多数仅在您研究库中image processors的代码时有用。
+
+
+## 图像转换
+
+[[autodoc]] image_transforms.center_crop
+
+[[autodoc]] image_transforms.center_to_corners_format
+
+[[autodoc]] image_transforms.corners_to_center_format
+
+[[autodoc]] image_transforms.id_to_rgb
+
+[[autodoc]] image_transforms.normalize
+
+[[autodoc]] image_transforms.pad
+
+[[autodoc]] image_transforms.rgb_to_id
+
+[[autodoc]] image_transforms.rescale
+
+[[autodoc]] image_transforms.resize
+
+[[autodoc]] image_transforms.to_pil_image
+
+## ImageProcessingMixin
+
+[[autodoc]] image_processing_utils.ImageProcessingMixin
diff --git a/docs/source/zh/internal/modeling_utils.md b/docs/source/zh/internal/modeling_utils.md
new file mode 100644
index 00000000000000..93341b323e836b
--- /dev/null
+++ b/docs/source/zh/internal/modeling_utils.md
@@ -0,0 +1,83 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 自定义层和工具
+
+此页面列出了库使用的所有自定义层，以及它为模型提供的实用函数。
+
+其中大多数只有在您研究库中模型的代码时才有用。
+
+
+## Pytorch自定义模块
+
+[[autodoc]] pytorch_utils.Conv1D
+
+[[autodoc]] modeling_utils.PoolerStartLogits
+    - forward
+
+[[autodoc]] modeling_utils.PoolerEndLogits
+    - forward
+
+[[autodoc]] modeling_utils.PoolerAnswerClass
+    - forward
+
+[[autodoc]] modeling_utils.SquadHeadOutput
+
+[[autodoc]] modeling_utils.SQuADHead
+    - forward
+
+[[autodoc]] modeling_utils.SequenceSummary
+    - forward
+
+## PyTorch帮助函数
+
+[[autodoc]] pytorch_utils.apply_chunking_to_forward
+
+[[autodoc]] pytorch_utils.find_pruneable_heads_and_indices
+
+[[autodoc]] pytorch_utils.prune_layer
+
+[[autodoc]] pytorch_utils.prune_conv1d_layer
+
+[[autodoc]] pytorch_utils.prune_linear_layer
+
+## TensorFlow自定义层
+
+[[autodoc]] modeling_tf_utils.TFConv1D
+
+[[autodoc]] modeling_tf_utils.TFSequenceSummary
+
+## TensorFlow loss 函数
+
+[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss
+
+[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss
+
+[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss
+
+[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss
+
+[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss
+
+[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss
+
+## TensorFlow帮助函数
+
+[[autodoc]] modeling_tf_utils.get_initializer
+
+[[autodoc]] modeling_tf_utils.keras_serializable
+
+[[autodoc]] modeling_tf_utils.shape_list
diff --git a/docs/source/zh/internal/pipelines_utils.md b/docs/source/zh/internal/pipelines_utils.md
new file mode 100644
index 00000000000000..30fdb8cd1d4006
--- /dev/null
+++ b/docs/source/zh/internal/pipelines_utils.md
@@ -0,0 +1,45 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# pipelines的工具
+
+
+此页面列出了库为pipelines提供的所有实用程序功能。
+
+其中大多数只有在您研究库中模型的代码时才有用。
+
+
+## 参数处理
+
+[[autodoc]] pipelines.ArgumentHandler
+
+[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler
+
+[[autodoc]] pipelines.QuestionAnsweringArgumentHandler
+
+## 数据格式
+
+[[autodoc]] pipelines.PipelineDataFormat
+
+[[autodoc]] pipelines.CsvPipelineDataFormat
+
+[[autodoc]] pipelines.JsonPipelineDataFormat
+
+[[autodoc]] pipelines.PipedPipelineDataFormat
+
+## 实用函数
+
+[[autodoc]] pipelines.PipelineException
diff --git a/docs/source/zh/internal/time_series_utils.md b/docs/source/zh/internal/time_series_utils.md
new file mode 100644
index 00000000000000..4b9093fbf478c5
--- /dev/null
+++ b/docs/source/zh/internal/time_series_utils.md
@@ -0,0 +1,31 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 时间序列工具
+
+
+此页面列出了可用于时间序列类模型的所有实用函数和类。
+
+其中大多数仅在您研究时间序列模型的代码，或希望添加到分布输出类集合时有用。
+
+
+## 输出分布
+
+[[autodoc]] time_series_utils.NormalOutput
+
+[[autodoc]] time_series_utils.StudentTOutput
+
+[[autodoc]] time_series_utils.NegativeBinomialOutput
diff --git a/docs/source/zh/internal/tokenization_utils.md b/docs/source/zh/internal/tokenization_utils.md
new file mode 100644
index 00000000000000..9f216131c122ef
--- /dev/null
+++ b/docs/source/zh/internal/tokenization_utils.md
@@ -0,0 +1,43 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Tokenizers的工具
+
+并保留格式：此页面列出了tokenizers使用的所有实用函数，主要是类
+[`~tokenization_utils_base.PreTrained TokenizerBase`] 实现了常用方法之间的
+[`PreTrained Tokenizer`] 和 [`PreTrained TokenizerFast`] 以及混合类
+[`~tokenization_utils_base.SpecialTokens Mixin`]。
+
+其中大多数只有在您研究库中tokenizers的代码时才有用。
+
+
+## PreTrainedTokenizerBase
+
+[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase
+    - __call__
+    - all
+
+## SpecialTokensMixin
+
+[[autodoc]] tokenization_utils_base.SpecialTokensMixin
+
+## Enums和namedtuples(命名元组)
+
+[[autodoc]] tokenization_utils_base.TruncationStrategy
+
+[[autodoc]] tokenization_utils_base.CharSpan
+
+[[autodoc]] tokenization_utils_base.TokenSpan
diff --git a/docs/source/zh/internal/trainer_utils.md b/docs/source/zh/internal/trainer_utils.md
new file mode 100644
index 00000000000000..fc28ba623c9d60
--- /dev/null
+++ b/docs/source/zh/internal/trainer_utils.md
@@ -0,0 +1,50 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Trainer的工具
+
+此页面列出了 [`Trainer`] 使用的所有实用函数。
+
+其中大多数仅在您研究库中Trainer的代码时有用。
+
+
+## 工具
+
+[[autodoc]] EvalPrediction
+
+[[autodoc]] IntervalStrategy
+
+[[autodoc]] enable_full_determinism
+
+[[autodoc]] set_seed
+
+[[autodoc]] torch_distributed_zero_first
+
+## Callbacks内部机制
+
+[[autodoc]] trainer_callback.CallbackHandler
+
+## 分布式评估
+
+[[autodoc]] trainer_pt_utils.DistributedTensorGatherer
+
+## Trainer参数解析
+
+[[autodoc]] HfArgumentParser
+
+## Debug工具
+
+[[autodoc]] debug_utils.DebugUnderflowOverflow
diff --git a/docs/source/zh/main_classes/agent.md b/docs/source/zh/main_classes/agent.md
new file mode 100644
index 00000000000000..8c7cc53fefd6a4
--- /dev/null
+++ b/docs/source/zh/main_classes/agent.md
@@ -0,0 +1,101 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Agents和工具
+
+<Tip warning={true}>
+
+Transformers Agents是一个实验性的API，它随时可能发生变化。由于API或底层模型容易发生变化，因此由agents返回的结果可能会有所不同。
+
+
+</Tip>
+
+要了解更多关于agents和工具的信息，请确保阅读[介绍指南](../transformers_agents)。此页面包含底层类的API文档。
+
+
+## Agents
+
+我们提供三种类型的agents：[`HfAgent`]使用开源模型的推理端点，[`LocalAgent`]使用您在本地选择的模型，[`OpenAiAgent`]使用OpenAI封闭模型。
+
+
+### HfAgent
+
+[[autodoc]] HfAgent
+
+### LocalAgent
+
+[[autodoc]] LocalAgent
+
+### OpenAiAgent
+
+[[autodoc]] OpenAiAgent
+
+### AzureOpenAiAgent
+
+[[autodoc]] AzureOpenAiAgent
+
+### Agent
+
+[[autodoc]] Agent 
+    - chat 
+    - run 
+    - prepare_for_new_chat
+
+## 工具
+
+### load_tool
+
+[[autodoc]] load_tool
+
+### Tool
+
+[[autodoc]] Tool
+
+### PipelineTool
+
+[[autodoc]] PipelineTool
+
+### RemoteTool
+
+[[autodoc]] RemoteTool
+
+### launch_gradio_demo
+
+[[autodoc]] launch_gradio_demo
+
+## Agent类型
+
+Agents可以处理工具之间任何类型的对象；工具是多模态的，可以接受和返回文本、图像、音频、视频等类型。为了增加工具之间的兼容性，以及正确地在ipython（jupyter、colab、ipython notebooks等）中呈现这些返回值，我们实现了这些类型的包装类。
+
+被包装的对象应该继续按照最初的行为方式运作；文本对象应该仍然像字符串一样运作，图像对象应该仍然像`PIL.Image`一样运作。
+
+这些类型有三个特定目的：
+
+- 对类型调用 `to_raw` 应该返回底层对象
+- 对类型调用 `to_string` 应该将对象作为字符串返回：在`AgentText`的情况下可能是字符串，但在其他情况下可能是对象序列化版本的路径
+- 在ipython内核中显示它应该正确显示对象
+
+### AgentText
+
+[[autodoc]] transformers.tools.agent_types.AgentText
+
+### AgentImage
+
+[[autodoc]] transformers.tools.agent_types.AgentImage
+
+### AgentAudio
+
+[[autodoc]] transformers.tools.agent_types.AgentAudio
diff --git a/docs/source/zh/main_classes/callback.md b/docs/source/zh/main_classes/callback.md
new file mode 100644
index 00000000000000..be05c37aec9e73
--- /dev/null
+++ b/docs/source/zh/main_classes/callback.md
@@ -0,0 +1,125 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Callbacks
+
+
+Callbacks可以用来自定义PyTorch [Trainer]中训练循环行为的对象（此功能尚未在TensorFlow中实现），该对象可以检查训练循环状态（用于进度报告、在TensorBoard或其他ML平台上记录日志等），并做出决策（例如提前停止）。
+
+Callbacks是“只读”的代码片段，除了它们返回的[TrainerControl]对象外，它们不能更改训练循环中的任何内容。对于需要更改训练循环的自定义，您应该继承[Trainer]并重载您需要的方法（有关示例，请参见[trainer](trainer)）。
+
+默认情况下，`TrainingArguments.report_to` 设置为"all"，然后[Trainer]将使用以下callbacks。
+
+
+- [`DefaultFlowCallback`]，它处理默认的日志记录、保存和评估行为
+- [`PrinterCallback`] 或 [`ProgressCallback`]，用于显示进度和打印日志（如果通过[`TrainingArguments`]停用tqdm，则使用第一个函数；否则使用第二个）。
+- [`~integrations.TensorBoardCallback`]，如果TensorBoard可访问（通过PyTorch版本 >= 1.4 或者 tensorboardX）。
+- [`~integrations.WandbCallback`]，如果安装了[wandb](https://www.wandb.com/)。
+- [`~integrations.CometCallback`]，如果安装了[comet_ml](https://www.comet.ml/site/)。
+- [`~integrations.MLflowCallback`]，如果安装了[mlflow](https://www.mlflow.org/)。
+- [`~integrations.NeptuneCallback`]，如果安装了[neptune](https://neptune.ai/)。
+- [`~integrations.AzureMLCallback`]，如果安装了[azureml-sdk](https://pypi.org/project/azureml-sdk/)。
+- [`~integrations.CodeCarbonCallback`]，如果安装了[codecarbon](https://pypi.org/project/codecarbon/)。
+- [`~integrations.ClearMLCallback`]，如果安装了[clearml](https://github.com/allegroai/clearml)。
+- [`~integrations.DagsHubCallback`]，如果安装了[dagshub](https://dagshub.com/)。
+- [`~integrations.FlyteCallback`]，如果安装了[flyte](https://flyte.org/)。
+- [`~integrations.DVCLiveCallback`]，如果安装了[dvclive](https://dvc.org/doc/dvclive)。
+
+如果安装了一个软件包，但您不希望使用相关的集成，您可以将 `TrainingArguments.report_to` 更改为仅包含您想要使用的集成的列表（例如 `["azure_ml", "wandb"]`）。
+
+实现callbacks的主要类是[`TrainerCallback`]。它获取用于实例化[`Trainer`]的[`TrainingArguments`]，可以通过[`TrainerState`]访问该Trainer的内部状态，并可以通过[`TrainerControl`]对训练循环执行一些操作。
+
+
+## 可用的Callbacks
+
+这里是库里可用[`TrainerCallback`]的列表：
+
+[[autodoc]] integrations.CometCallback 
+    - setup
+
+[[autodoc]] DefaultFlowCallback
+
+[[autodoc]] PrinterCallback
+
+[[autodoc]] ProgressCallback
+
+[[autodoc]] EarlyStoppingCallback
+
+[[autodoc]] integrations.TensorBoardCallback
+
+[[autodoc]] integrations.WandbCallback 
+    - setup
+
+[[autodoc]] integrations.MLflowCallback 
+    - setup
+
+[[autodoc]] integrations.AzureMLCallback
+
+[[autodoc]] integrations.CodeCarbonCallback
+
+[[autodoc]] integrations.NeptuneCallback
+
+[[autodoc]] integrations.ClearMLCallback
+
+[[autodoc]] integrations.DagsHubCallback
+
+[[autodoc]] integrations.FlyteCallback
+
+[[autodoc]] integrations.DVCLiveCallback
+    - setup
+
+## TrainerCallback
+
+[[autodoc]] TrainerCallback
+
+以下是如何使用PyTorch注册自定义callback的示例：
+
+[`Trainer`]:
+
+```python
+class MyCallback(TrainerCallback):
+    "A callback that prints a message at the beginning of training"
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        print("Starting training")
+
+
+trainer = Trainer(
+    model,
+    args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
+)
+```
+
+注册callback的另一种方式是调用 `trainer.add_callback()`，如下所示：
+
+
+```python
+trainer = Trainer(...)
+trainer.add_callback(MyCallback)
+# Alternatively, we can pass an instance of the callback class
+trainer.add_callback(MyCallback())
+```
+
+## TrainerState
+
+[[autodoc]] TrainerState
+
+## TrainerControl
+
+[[autodoc]] TrainerControl
diff --git a/docs/source/zh/main_classes/configuration.md b/docs/source/zh/main_classes/configuration.md
new file mode 100644
index 00000000000000..755a3170419bf4
--- /dev/null
+++ b/docs/source/zh/main_classes/configuration.md
@@ -0,0 +1,28 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Configuration
+
+基类[`PretrainedConfig`]实现了从本地文件或目录加载/保存配置的常见方法，或下载库提供的预训练模型配置（从HuggingFace的AWS S3库中下载）。
+
+每个派生的配置类都实现了特定于模型的属性。所有配置类中共同存在的属性有：`hidden_size`、`num_attention_heads` 和 `num_hidden_layers`。文本模型进一步添加了 `vocab_size`。
+
+
+## PretrainedConfig
+
+[[autodoc]] PretrainedConfig
+    - push_to_hub
+    - all
diff --git a/docs/source/zh/main_classes/data_collator.md b/docs/source/zh/main_classes/data_collator.md
new file mode 100644
index 00000000000000..d947b53ea14cb2
--- /dev/null
+++ b/docs/source/zh/main_classes/data_collator.md
@@ -0,0 +1,65 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Data Collator
+
+Data collators是一个对象，通过使用数据集元素列表作为输入来形成一个批次。这些元素与 `train_dataset` 或 `eval_dataset` 的元素类型相同。
+
+为了能够构建批次，Data collators可能会应用一些预处理（比如填充）。其中一些（比如[`DataCollatorForLanguageModeling`]）还会在形成的批次上应用一些随机数据增强（比如随机掩码）。
+
+在[示例脚本](../examples)或[示例notebooks](../notebooks)中可以找到使用的示例。
+
+
+## Default data collator
+
+[[autodoc]] data.data_collator.default_data_collator
+
+## DefaultDataCollator
+
+[[autodoc]] data.data_collator.DefaultDataCollator
+
+## DataCollatorWithPadding
+
+[[autodoc]] data.data_collator.DataCollatorWithPadding
+
+## DataCollatorForTokenClassification
+
+[[autodoc]] data.data_collator.DataCollatorForTokenClassification
+
+## DataCollatorForSeq2Seq
+
+[[autodoc]] data.data_collator.DataCollatorForSeq2Seq
+
+## DataCollatorForLanguageModeling
+
+[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
+
+## DataCollatorForWholeWordMask
+
+[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
+
+## DataCollatorForPermutationLanguageModeling
+
+[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
diff --git a/docs/source/zh/main_classes/deepspeed.md b/docs/source/zh/main_classes/deepspeed.md
new file mode 100644
index 00000000000000..c9f9781b65f41a
--- /dev/null
+++ b/docs/source/zh/main_classes/deepspeed.md
@@ -0,0 +1,2101 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# DeepSpeed集成
+
+[DeepSpeed](https://github.com/microsoft/DeepSpeed)实现了[ZeRO论文](https://arxiv.org/abs/1910.02054)中描述的所有内容。目前，它提供对以下功能的全面支持：
+
+1. 优化器状态分区（ZeRO stage 1）
+2. 梯度分区（ZeRO stage 2）
+3. 参数分区（ZeRO stage 3）
+4. 自定义混合精度训练处理
+5. 一系列基于CUDA扩展的快速优化器
+6. ZeRO-Offload 到 CPU 和 NVMe
+
+ZeRO-Offload有其自己的专门论文：[ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)。而NVMe支持在论文[ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)中进行了描述。
+
+DeepSpeed ZeRO-2主要用于训练，因为它的特性对推理没有用处。
+
+DeepSpeed ZeRO-3也可以用于推理，因为它允许将单个GPU无法加载的大模型加载到多个GPU上。
+
+🤗 Transformers通过以下两种方式集成了[DeepSpeed](https://github.com/microsoft/DeepSpeed)：
+
+1. 通过[`Trainer`]集成核心的DeepSpeed功能。这是一种“为您完成一切”式的集成 - 您只需提供自定义配置文件或使用我们的模板配置文件。本文档的大部分内容都集中在这个功能上。
+2. 如果您不使用[`Trainer`]并希望在自己的Trainer中集成DeepSpeed，那么像`from_pretrained`和`from_config`这样的核心功能函数将包括ZeRO stage 3及以上的DeepSpeed的基础部分，如`zero.Init`。要利用此功能，请阅读有关[非Trainer DeepSpeed集成](#nontrainer-deepspeed-integration)的文档。
+
+集成的内容：
+
+训练：
+
+1. DeepSpeed ZeRO训练支持完整的ZeRO stages 1、2和3，以及ZeRO-Infinity（CPU和NVMe offload）。
+
+推理：
+
+1. DeepSpeed ZeRO推理支持ZeRO stage 3和ZeRO-Infinity。它使用与训练相同的ZeRO协议，但不使用优化器和学习率调度器，只有stage 3与推理相关。更多详细信息请参阅：[zero-inference](#zero-inference)。
+
+此外还有DeepSpeed推理 - 这是一种完全不同的技术，它使用张量并行而不是ZeRO（即将推出）。
+
+
+<a id='deepspeed-trainer-integration'></a>
+
+
+## Trainer DeepSpeed 集成
+
+
+<a id='deepspeed-installation'></a>
+
+### 安装
+
+通过pypi安装库：
+
+
+```bash
+pip install deepspeed
+```
+
+或通过 `transformers` 的 `extras`安装：
+
+```bash
+pip install transformers[deepspeed]
+```
+
+或在 [DeepSpeed 的 GitHub 页面](https://github.com/microsoft/deepspeed#installation) 和
+[高级安装](https://www.deepspeed.ai/tutorials/advanced-install/) 中查找更多详细信息。
+
+如果构建过程中仍然遇到问题，请首先确保阅读 [CUDA 扩展安装注意事项](trainer#cuda-extension-installation-notes)。
+
+如果您没有预先构建扩展而是在运行时构建它们，而且您尝试了以上所有解决方案都无效，下一步可以尝试在安装之前预先构建扩展。
+
+进行 DeepSpeed 的本地构建：
+
+
+```bash
+git clone https://github.com/microsoft/DeepSpeed/
+cd DeepSpeed
+rm -rf build
+TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
+--global-option="build_ext" --global-option="-j8" --no-cache -v \
+--disable-pip-version-check 2>&1 | tee build.log
+```
+
+如果您打算使用 NVMe offload，您还需要在上述说明中添加 `DS_BUILD_AIO=1`（并且还需要在系统范围内安装 *libaio-dev*）。
+
+编辑 `TORCH_CUDA_ARCH_LIST` 以插入您打算使用的 GPU 卡的架构代码。假设您的所有卡都是相同的，您可以通过以下方式获取架构：
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())"
+```
+
+因此，如果您得到 `8, 6`，则使用 `TORCH_CUDA_ARCH_LIST="8.6"`。如果您有多个不同的卡，您可以像这样列出所有卡 `TORCH_CUDA_ARCH_LIST="6.1;8.6"`。
+
+如果您需要在多台机器上使用相同的设置，请创建一个二进制 wheel：
+
+
+```bash
+git clone https://github.com/microsoft/DeepSpeed/
+cd DeepSpeed
+rm -rf build
+TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
+python setup.py build_ext -j8 bdist_wheel
+```
+
+它将生成类似于 `dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl` 的文件，现在您可以在本地或任何其他机器上安装它，如 `pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`。
+
+再次提醒确保调整 `TORCH_CUDA_ARCH_LIST` 以匹配目标架构。
+
+您可以在[这里](https://developer.nvidia.com/cuda-gpus)找到完整的 NVIDIA GPU 列表及其对应的 **计算能力**（与此上下文中的架构相同）。
+
+您可以使用以下命令检查 PyTorch 构建时使用的架构：
+
+
+```bash
+python -c "import torch; print(torch.cuda.get_arch_list())"
+```
+
+以下是如何查找已安装 GPU 中的一张卡的架构。例如，对于 GPU 0：
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python -c "import torch; \
+print(torch.cuda.get_device_properties(torch.device('cuda')))"
+```
+
+如果输出结果如下：
+
+```bash
+_CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82)
+```
+
+然后您就知道这张卡的架构是 `8.6`。
+
+您也可以完全省略 `TORCH_CUDA_ARCH_LIST`，然后构建程序将自动查询构建所在的 GPU 的架构。这可能与目标机器上的 GPU 不匹配，因此最好明确指定所需的架构。
+
+如果尝试了所有建议的方法仍然遇到构建问题，请继续在 [Deepspeed](https://github.com/microsoft/DeepSpeed/issues)的 GitHub Issue 上提交问题。
+
+
+<a id='deepspeed-multi-gpu'></a>
+
+### 多GPU启用
+
+为了启用DeepSpeed 集成，调整 [`Trainer`] 的命令行参数，添加一个新的参数 `--deepspeed ds_config.json`，其中 `ds_config.json` 是 DeepSpeed 配置文件，如文档 [这里](https://www.deepspeed.ai/docs/config-json/) 所述。文件命名由您决定。
+建议使用 DeepSpeed 的 `add_config_arguments` 程序将必要的命令行参数添加到您的代码中。
+有关更多信息，请参阅 [DeepSpeed 的参数解析](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) 文档。
+
+在这里，您可以使用您喜欢的启动器。您可以继续使用 PyTorch 启动器：
+
+
+```bash
+torch.distributed.run --nproc_per_node=2 your_program.py <normal cl args> --deepspeed ds_config.json
+```
+
+或使用由 `deepspeed` 提供的启动器：
+
+
+```bash
+deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json
+```
+
+
+正如您所见，这两个启动器的参数不同，但对于大多数需求，任何一个都可以满足工作需求。有关如何配置各个节点和 GPU 的完整详细信息，请查看 [此处](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node)。
+
+当您使用 `deepspeed` 启动器并且希望使用所有可用的 GPU 时，您可以简单地省略 `--num_gpus` 标志。
+
+以下是在 DeepSpeed 中启用使用所有可用 GPU情况下， 运行 `run_translation.py` 的示例：
+
+
+```bash
+deepspeed examples/pytorch/translation/run_translation.py \
+--deepspeed tests/deepspeed/ds_config_zero3.json \
+--model_name_or_path t5-small --per_device_train_batch_size 1 \
+--output_dir output_dir --overwrite_output_dir --fp16 \
+--do_train --max_train_samples 500 --num_train_epochs 1 \
+--dataset_name wmt16 --dataset_config "ro-en" \
+--source_lang en --target_lang ro
+```
+
+请注意，在 DeepSpeed 文档中，您可能会看到 `--deepspeed --deepspeed_config ds_config.json` - 即两个与 DeepSpeed 相关的参数，但为简单起见，并且因为已经有很多参数要处理，我们将两者合并为一个单一参数。
+
+有关一些实际使用示例，请参阅 [此帖](https://github.com/huggingface/transformers/issues/8771#issuecomment-759248400)。
+
+
+
+<a id='deepspeed-one-gpu'></a>
+
+### 单GPU启用
+
+要使用一张 GPU 启用 DeepSpeed，调整 [`Trainer`] 的命令行参数如下：
+
+
+```bash
+deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
+--deepspeed tests/deepspeed/ds_config_zero2.json \
+--model_name_or_path t5-small --per_device_train_batch_size 1 \
+--output_dir output_dir --overwrite_output_dir --fp16 \
+--do_train --max_train_samples 500 --num_train_epochs 1 \
+--dataset_name wmt16 --dataset_config "ro-en" \
+--source_lang en --target_lang ro
+```
+
+这与多 GPU 的情况几乎相同，但在这里我们通过 `--num_gpus=1` 明确告诉 DeepSpeed 仅使用一张 GPU。默认情况下，DeepSpeed 启用给定节点上可以看到的所有 GPU。如果您一开始只有一张 GPU，那么您不需要这个参数。以下 [文档](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) 讨论了启动器的选项。
+
+为什么要在仅使用一张 GPU 的情况下使用 DeepSpeed 呢？
+
+1. 它具有 ZeRO-offload 功能，可以将一些计算和内存委托给主机的 CPU 和 内存，从而为模型的需求保留更多 GPU 资源 - 例如更大的批处理大小，或启用正常情况下无法容纳的非常大模型。
+2. 它提供了智能的 GPU 内存管理系统，最小化内存碎片，这再次允许您容纳更大的模型和数据批次。
+
+虽然接下来我们将详细讨论配置，但在单个 GPU 上通过 DeepSpeed 实现巨大性能提升的关键是在配置文件中至少有以下配置：
+
+
+```json
+{
+  "zero_optimization": {
+     "stage": 2,
+     "offload_optimizer": {
+         "device": "cpu",
+         "pin_memory": true
+     },
+     "allgather_partitions": true,
+     "allgather_bucket_size": 2e8,
+     "reduce_scatter": true,
+     "reduce_bucket_size": 2e8,
+     "overlap_comm": true,
+     "contiguous_gradients": true
+  }
+}
+```
+
+这会启用`optimizer offload `和一些其他重要功能。您可以尝试不同的buffer大小，有关详细信息，请参见下面的讨论。
+
+关于这种启用类型的实际使用示例，请参阅 [此帖](https://github.com/huggingface/transformers/issues/8771#issuecomment-759176685)。
+
+您还可以尝试使用本文后面进一步解释的支持`CPU 和 NVMe offload`功能的ZeRO-3 。
+
+
+<!--- TODO: Benchmark whether we can get better performance out of ZeRO-3 vs. ZeRO-2 on a single GPU, and then
+recommend ZeRO-3 config as starting one. -->
+
+注意：
+
+- 如果您需要在特定的 GPU 上运行，而不是 GPU 0，则无法使用 `CUDA_VISIBLE_DEVICES` 来限制可用 GPU 的可见范围。相反，您必须使用以下语法：
+  
+  ```bash
+  deepspeed --include localhost:1 examples/pytorch/translation/run_translation.py ...
+  ```
+
+  在这个例子中，我们告诉 DeepSpeed 使用 GPU 1（第二个 GPU）。
+
+
+
+<a id='deepspeed-multi-node'></a>
+
+### 多节点启用
+
+这一部分的信息不仅适用于 DeepSpeed 集成，也适用于任何多节点程序。但 DeepSpeed 提供了一个比其他启动器更易于使用的 `deepspeed` 启动器，除非您在 SLURM 环境中。
+
+在本节，让我们假设您有两个节点，每个节点有 8 张 GPU。您可以通过 `ssh hostname1` 访问第一个节点，通过 `ssh hostname2` 访问第二个节点，两者必须能够在本地通过 ssh 无密码方式相互访问。当然，您需要将这些主机（节点）名称重命名为您实际使用的主机名称。
+
+
+#### torch.distributed.run启动器
+
+
+例如，要使用 `torch.distributed.run`，您可以执行以下操作：
+
+```bash
+python -m torch.distributed.run --nproc_per_node=8 --nnode=2 --node_rank=0 --master_addr=hostname1 \
+--master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json
+```
+
+您必须 ssh 到每个节点，并在每个节点上运行相同的命令！不用担心，启动器会等待两个节点同步完成。
+
+有关更多信息，请参阅 [torchrun](https://pytorch.org/docs/stable/elastic/run.html)。顺便说一下，这也是替代了几个 PyTorch 版本前的 `torch.distributed.launch` 的启动器。
+
+
+#### deepspeed启动器
+
+要改用 `deepspeed` 启动器，首先需要创建一个 `hostfile` 文件：
+
+```
+hostname1 slots=8
+hostname2 slots=8
+```
+然后，您可以这样启动：
+
+```bash
+deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 \
+your_program.py <normal cl args> --deepspeed ds_config.json
+```
+
+与 `torch.distributed.run` 启动器不同，`deepspeed` 将自动在两个节点上启动此命令！
+
+更多信息，请参阅[资源配置（多节点）](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node)。
+
+
+#### 在 SLURM 环境中启动
+
+在 SLURM 环境中，可以采用以下方法。以下是一个 SLURM 脚本 `launch.slurm`，您需要根据您的具体 SLURM 环境进行调整。
+
+```bash
+#SBATCH --job-name=test-nodes        # name
+#SBATCH --nodes=2                    # nodes
+#SBATCH --ntasks-per-node=1          # crucial - only 1 task per dist per node!
+#SBATCH --cpus-per-task=10           # number of cores per tasks
+#SBATCH --gres=gpu:8                 # number of gpus
+#SBATCH --time 20:00:00              # maximum execution time (HH:MM:SS)
+#SBATCH --output=%x-%j.out           # output file name
+
+export GPUS_PER_NODE=8
+export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
+export MASTER_PORT=9901
+
+srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
+ --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
+ --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
+your_program.py <normal cl args> --deepspeed ds_config.json'
+```
+
+剩下的就是运行它：
+
+```bash
+sbatch launch.slurm
+```
+
+`srun` 将负责在所有节点上同时启动程序。
+
+
+#### 使用非共享文件系统
+
+默认情况下，DeepSpeed 假定多节点环境使用共享存储。如果不是这种情况，每个节点只能看到本地文件系统，你需要调整配置文件，包含一个 [`checkpoint` 部分](https://www.deepspeed.ai/docs/config-json/#checkpoint-options)并设置如下选项：
+
+```json
+{
+  "checkpoint": {
+    "use_node_local_storage": true
+  }
+}
+```
+
+或者，你还可以使用 [`Trainer`] 的 `--save_on_each_node` 参数，上述配置将自动添加。
+
+
+<a id='deepspeed-notebook'></a>
+
+### 在Notebooks启用
+
+在将`notebook cells`作为脚本运行的情况下，问题在于没有正常的 `deepspeed` 启动器可依赖，因此在某些设置下，我们必须仿真运行它。
+
+如果您只使用一个 GPU，以下是如何调整notebook中的训练代码以使用 DeepSpeed。
+
+```python
+# DeepSpeed requires a distributed environment even when only one process is used.
+# This emulates a launcher in the notebook
+import os
+
+os.environ["MASTER_ADDR"] = "localhost"
+os.environ["MASTER_PORT"] = "9994"  # modify if RuntimeError: Address already in use
+os.environ["RANK"] = "0"
+os.environ["LOCAL_RANK"] = "0"
+os.environ["WORLD_SIZE"] = "1"
+
+# Now proceed as normal, plus pass the deepspeed config file
+training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json")
+trainer = Trainer(...)
+trainer.train()
+```
+
+注意：`...` 代表您传递给函数的正常参数。
+
+如果要使用多于一个 GPU，您必须在 DeepSpeed 中使用多进程环境。也就是说，您必须使用专门的启动器来实现这一目的，而不能通过仿真本节开头呈现的分布式环境来完成。
+
+如果想要在notebook中动态创建配置文件并保存在当前目录，您可以在一个专用的cell中使用：
+
+```python no-style
+%%bash
+cat <<'EOT' > ds_config_zero3.json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
+EOT
+```
+
+如果训练脚本在一个普通文件中而不是在notebook cells中，您可以通过笔记本中的 shell 正常启动 `deepspeed`。例如，要使用 `run_translation.py`，您可以这样启动：
+
+```python no-style
+!git clone https://github.com/huggingface/transformers
+!cd transformers; deepspeed examples/pytorch/translation/run_translation.py ...
+```
+
+或者使用 `%%bash` 魔术命令，您可以编写多行代码，用于运行 shell 程序：
+
+```python no-style
+%%bash
+
+git clone https://github.com/huggingface/transformers
+cd transformers
+deepspeed examples/pytorch/translation/run_translation.py ...
+```
+
+在这种情况下，您不需要本节开头呈现的任何代码。
+
+注意：虽然 `%%bash` 魔术命令很方便，但目前它会缓冲输出，因此在进程完成之前您看不到日志。
+
+
+<a id='deepspeed-config'></a>
+
+### 配置
+
+有关可以在 DeepSpeed 配置文件中使用的完整配置选项的详细指南，请参阅[以下文档](https://www.deepspeed.ai/docs/config-json/)。
+
+您可以在 [DeepSpeedExamples 仓库](https://github.com/microsoft/DeepSpeedExamples)中找到解决各种实际需求的数十个 DeepSpeed 配置示例。
+
+```bash
+git clone https://github.com/microsoft/DeepSpeedExamples
+cd DeepSpeedExamples
+find . -name '*json'
+```
+
+延续上面的代码，假设您要配置 Lamb 优化器。那么您可以通过以下方式在示例的 `.json` 文件中进行搜索：
+
+```bash
+grep -i Lamb $(find . -name '*json')
+```
+
+还可以在[主仓](https://github.com/microsoft/DeepSpeed)中找到更多示例。
+
+在使用 DeepSpeed 时，您总是需要提供一个 DeepSpeed 配置文件，但是一些配置参数必须通过命令行进行配置。您将在本指南的剩余章节找到这些细微差别。
+
+为了了解 DeepSpeed 配置文件，这里有一个激活 ZeRO stage 2 功能的示例，包括优化器状态的 CPU offload，使用 `AdamW` 优化器和 `WarmupLR`  调度器，并且如果传递了 `--fp16` 参数将启用混合精度训练：
+
+```json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+}
+```
+
+当您执行程序时，DeepSpeed 将把它从 [`Trainer`] 收到的配置日志输出到console，因此您可以看到传递给它的最终配置。
+
+
+
+<a id='deepspeed-config-passing'></a>
+
+### 传递配置
+
+正如本文档讨论的那样，通常将 DeepSpeed 配置作为指向 JSON 文件的路径传递，但如果您没有使用命令行界面配置训练，而是通过 [`TrainingArguments`] 实例化 [`Trainer`]，那么对于 `deepspeed` 参数，你可以传递一个嵌套的 `dict`。这使您能够即时创建配置，而无需在将其传递给 [`TrainingArguments`] 之前将其写入文件系统。
+
+总结起来，您可以这样做：
+
+```python
+TrainingArguments(..., deepspeed="/path/to/ds_config.json")
+```
+
+或者:
+
+```python
+ds_config_dict = dict(scheduler=scheduler_params, optimizer=optimizer_params)
+TrainingArguments(..., deepspeed=ds_config_dict)
+```
+
+<a id='deepspeed-config-shared'></a>
+
+### 共享配置
+
+
+<Tip warning={true}>
+
+这一部分是必读的。
+
+</Tip>
+
+一些配置值对于 [`Trainer`] 和 DeepSpeed 正常运行都是必需的，因此，为了防止定义冲突及导致的难以检测的错误，我们选择通过 [`Trainer`] 命令行参数配置这些值。
+
+此外，一些配置值是基于模型的配置自动派生的，因此，与其记住手动调整多个值，最好让 [`Trainer`] 为您做大部分配置。
+
+因此，在本指南的其余部分，您将找到一个特殊的配置值：`auto`，当设置时将自动将参数替换为正确或最有效的值。请随意选择忽略此建议或显式设置该值，在这种情况下，请务必确保 [`Trainer`] 参数和 DeepSpeed 配置保持一致。例如，您是否使用相同的学习率、批量大小或梯度累积设置？如果这些不匹配，训练可能以非常难以检测的方式失败。请重视该警告。
+
+还有一些参数是仅适用于 DeepSpeed 的，并且这些参数必须手动设置以适应您的需求。
+
+在您自己的程序中，如果您想要作为主动修改 DeepSpeed 配置并以此配置 [`TrainingArguments`]，您还可以使用以下方法。步骤如下：
+
+1. 创建或加载要用作主配置的 DeepSpeed 配置
+2. 根据这些参数值创建 [`TrainingArguments`] 对象
+
+请注意，一些值，比如 `scheduler.params.total_num_steps`，是在 [`Trainer`] 的 `train` 过程中计算的，但当然您也可以自己计算这些值。
+
+
+<a id='deepspeed-zero'></a>
+
+### ZeRO
+
+[Zero Redundancy Optimizer (ZeRO)](https://www.deepspeed.ai/tutorials/zero/) 是 DeepSpeed 的工作核心。它支持3个不同级别（stages）的优化。Stage 1 对于扩展性来说不是很有趣，因此本文档重点关注Stage 2和Stage 3。Stage 3通过最新的 ZeRO-Infinity 进一步改进。你可以在 DeepSpeed 文档中找到更详细的信息。
+
+配置文件的 `zero_optimization` 部分是最重要的部分（[文档](https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training)），因为在这里您定义了要启用哪些 ZeRO stages 以及如何配置它们。您可以在 DeepSpeed 文档中找到每个参数的解释。
+
+这一部分必须通过 DeepSpeed 配置文件单独配置 - [`Trainer`] 不提供相应的命令行参数。
+
+注意：目前 DeepSpeed 不验证参数名称，因此如果您拼错了任何参数，它将使用拼写错误的参数的默认设置。您可以观察 DeepSpeed 引擎启动日志消息，看看它将使用哪些值。
+
+<a id='deepspeed-zero2-config'></a>
+
+#### ZeRO-2 配置
+
+以下是 ZeRO stage 2 的配置示例：
+
+```json
+{
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 5e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 5e8,
+        "contiguous_gradients": true
+    }
+}
+```
+
+**性能调优：**
+
+- 启用 `offload_optimizer` 应该减少 GPU 内存使用（需要 `"stage": 2`）。
+- `"overlap_comm": true` 通过增加 GPU 内存使用来降低all-reduce 的延迟。 `overlap_comm` 使用了 `allgather_bucket_size` 和 `reduce_bucket_size` 值的4.5倍。因此，如果它们设置为 `5e8`，这将需要一个9GB的内存占用（`5e8 x 2Bytes x 2 x 4.5`）。因此，如果您的 GPU 内存为8GB或更小，为了避免出现OOM错误，您需要将这些参数减小到约 `2e8`，这将需要3.6GB。如果您的 GPU 容量更大，当您开始遇到OOM时，你可能也需要这样做。
+- 当减小这些buffers时，您以更慢的通信速度来换取更多的 GPU 内存。buffers大小越小，通信速度越慢，GPU 可用于其他任务的内存就越多。因此，如果更大的批处理大小很重要，那么稍微减慢训练时间可能是一个很好的权衡。
+
+此外，`deepspeed==0.4.4` 添加了一个新选项 `round_robin_gradients`，您可以通过以下方式启用：
+
+```json
+{
+    "zero_optimization": {
+        "round_robin_gradients": true
+    }
+}
+```
+这是一个用于 CPU offloading 的stage 2优化，通过细粒度梯度分区在 ranks 之间并行复制到 CPU 内存，从而实现了性能的提升。性能优势随着梯度累积步骤（在优化器步骤之间进行更多复制）或 GPU 数量（增加并行性）增加而增加。
+
+<a id='deepspeed-zero3-config'></a>
+
+#### ZeRO-3 配置
+
+以下是 ZeRO stage 3的配置示例：
+
+```json
+{
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    }
+}
+```
+
+如果您因为你的模型或激活值超过 GPU 内存而遇到OOM问题，并且您有未使用的 CPU 内存，可以通股票使用 `"device": "cpu"` 将优化器状态和参数卸载到 CPU 内存中，来解决这个限制。如果您不想卸载到 CPU 内存，可以在 `device` 条目中使用 `none` 代替 `cpu`。将优化器状态卸载到 NVMe 上会在后面进一步讨论。
+
+通过将 `pin_memory` 设置为 `true` 启用固定内存。此功能会以减少可用于其他进程的内存为代价来提高吞吐量。固定内存被分配给特定请求它的进程，通常比普通 CPU 内存访问速度更快。
+
+**性能调优：**
+
+- `stage3_max_live_parameters`: `1e9`
+- `stage3_max_reuse_distance`: `1e9`
+
+如果遇到OOM问题，请减小 `stage3_max_live_parameters` 和 `stage3_max_reuse_distance`。它们对性能的影响应该很小，除非您正在进行激活值checkpointing。`1e9` 大约会消耗 ~2GB。内存由 `stage3_max_live_parameters` 和 `stage3_max_reuse_distance` 共享，所以它不是叠加的，而是总共2GB。
+
+`stage3_max_live_parameters` 是在任何给定时间要在 GPU 上保留多少个完整参数的上限。"reuse distance" 是我们用来确定参数在将来何时会再次使用的度量标准，我们使用 `stage3_max_reuse_distance` 来决定是丢弃参数还是保留参数。如果一个参数在不久的将来（小于 `stage3_max_reuse_distance`）将被再次使用，那么我们将其保留以减少通信开销。这在启用激活值checkpoing时非常有用，其中我们以单层粒度进行前向重计算和反向传播，并希望在反向传播期间保留前向重计算中的参数。
+
+以下配置值取决于模型的隐藏大小：
+
+- `reduce_bucket_size`: `hidden_size*hidden_size`
+- `stage3_prefetch_bucket_size`: `0.9 * hidden_size * hidden_size`
+- `stage3_param_persistence_threshold`: `10 * hidden_size`
+
+因此，将这些值设置为 `auto`，[`Trainer`] 将自动分配推荐的参数值。当然，如果您愿意，也可以显式设置这些值。
+
+`stage3_gather_16bit_weights_on_model_save` 在模型保存时启用模型的 fp16 权重整合。对于大模型和多个 GPU，无论是在内存还是速度方面，这都是一项昂贵的操作。目前如果计划恢复训练，这是必需的。请注意未来的更新可能会删除此限制并让使用更加灵活。
+
+如果您从 ZeRO-2 配置迁移，请注意 `allgather_partitions`、`allgather_bucket_size` 和 `reduce_scatter` 配置参数在 ZeRO-3 中不被使用。如果保留这些配置文件，它们将被忽略。
+
+- `sub_group_size`: `1e9`
+
+`sub_group_size` 控制在优化器步骤期间更新参数的粒度。参数被分组到大小为 `sub_group_size` 的桶中，每个桶逐个更新。在 ZeRO-Infinity 中与 NVMe offload一起使用时，`sub_group_size` 控制了在优化器步骤期间在 NVMe 和 CPU 内存之间移动模型状态的粒度。这可以防止非常大的模型耗尽 CPU 内存。
+
+当不使用 NVMe offload时，可以将 `sub_group_size` 保留为其默认值 *1e9*。在以下情况下，您可能需要更改其默认值：
+
+1. 在优化器步骤中遇到OOM：减小 `sub_group_size` 以减少临时buffers的内存利用
+2. 优化器步骤花费很长时间：增加 `sub_group_size` 以提高由于增加的数据buffers而导致的带宽利用率。
+
+
+#### ZeRO-0 配置
+
+请注意，我们将 Stage 0 和 1 放在最后，因为它们很少使用。
+
+Stage 0 禁用了所有类型的分片，只是将 DeepSpeed 作为 DDP 使用。您可以通过以下方式启用：
+
+```json
+{
+    "zero_optimization": {
+        "stage": 0
+    }
+}
+```
+
+这将实质上禁用 ZeRO，而无需更改其他任何内容。
+
+
+#### ZeRO-1 配置
+
+
+Stage 1 等同于 Stage 2 减去梯度分片。您可以尝试使用以下配置，仅对优化器状态进行分片，以稍微加速：
+
+
+```json
+{
+    "zero_optimization": {
+        "stage": 1
+    }
+}
+```
+
+
+
+<a id='deepspeed-nvme'></a>
+
+### NVMe 支持
+
+ZeRO-Infinity 通过使用 NVMe 内存扩展 GPU 和 CPU 内存，从而允许训练非常大的模型。由于智能分区和平铺算法，在offload期间每个 GPU 需要发送和接收非常小量的数据，因此 NVMe 被证明适用于训练过程中提供更大的总内存池。ZeRO-Infinity 需要启用 ZeRO-3。
+
+以下配置示例启用 NVMe 来offload优化器状态和参数：
+
+```json
+{
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "nvme",
+            "nvme_path": "/local_nvme",
+            "pin_memory": true,
+            "buffer_count": 4,
+            "fast_init": false
+        },
+        "offload_param": {
+            "device": "nvme",
+            "nvme_path": "/local_nvme",
+            "pin_memory": true,
+            "buffer_count": 5,
+            "buffer_size": 1e8,
+            "max_in_cpu": 1e9
+        },
+        "aio": {
+            "block_size": 262144,
+            "queue_depth": 32,
+            "thread_count": 1,
+            "single_submit": false,
+            "overlap_events": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+}
+```
+
+您可以选择将优化器状态和参数都卸载到 NVMe，也可以只选择其中一个，或者都不选择。例如，如果您有大量的 CPU 内存可用，只卸载到 CPU 内存训练速度会更快（提示："device": "cpu"）。
+
+这是有关卸载 [优化器状态](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) 和 [参数](https://www.deepspeed.ai/docs/config-json/#parameter-offloading) 的完整文档。
+
+确保您的 `nvme_path` 实际上是一个 NVMe，因为它与普通硬盘或 SSD 一起工作，但速度会慢得多。快速可扩展的训练是根据现代 NVMe 传输速度设计的（截至本文撰写时，可以达到 ~3.5GB/s 读取，~3GB/s 写入的峰值速度）。
+
+为了找出最佳的 `aio` 配置块，您必须在目标设置上运行一个基准测试，具体操作请参见[说明](https://github.com/microsoft/DeepSpeed/issues/998)。
+
+
+
+<a id='deepspeed-zero2-zero3-performance'></a>
+
+#### ZeRO-2 和 ZeRO-3 性能对比
+
+如果其他一切都配置相同，ZeRO-3 可能比 ZeRO-2 慢，因为前者除了 ZeRO-2 的操作外，还必须收集模型权重。如果 ZeRO-2 满足您的需求，而且您不需要扩展到几个 GPU 以上，那么您可以选择继续使用它。重要的是要理解，ZeRO-3 以速度为代价实现了更高的可扩展性。
+
+可以调整 ZeRO-3 配置使其性能接近 ZeRO-2：
+
+- 将 `stage3_param_persistence_threshold` 设置为一个非常大的数字 - 大于最大的参数，例如 `6 * hidden_size * hidden_size`。这将保留参数在 GPU 上。
+- 关闭 `offload_params`，因为 ZeRO-2 没有这个选项。
+
+即使不更改 `stage3_param_persistence_threshold`，仅将 `offload_params` 关闭，性能可能会显著提高。当然，这些更改将影响您可以训练的模型的大小。因此，这些更改可根据需求帮助您在可扩展性和速度之间进行权衡。
+
+
+
+<a id='deepspeed-zero2-example'></a>
+
+#### ZeRO-2 示例
+
+这是一个完整的 ZeRO-2 自动配置文件 `ds_config_zero2.json`：
+
+```json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
+```
+
+这是一个完整的手动设置的启用所有功能的 ZeRO-2 配置文件。主要是为了让您看到典型的参数值是什么样的，但我们强烈建议使用其中包含多个 `auto` 设置的配置文件。
+
+```json
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 3e-5,
+            "betas": [0.8, 0.999],
+            "eps": 1e-8,
+            "weight_decay": 3e-7
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 3e-5,
+            "warmup_num_steps": 500
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 2,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "allgather_partitions": true,
+        "allgather_bucket_size": 2e8,
+        "overlap_comm": true,
+        "reduce_scatter": true,
+        "reduce_bucket_size": 2e8,
+        "contiguous_gradients": true
+    },
+
+    "steps_per_print": 2000,
+    "wall_clock_breakdown": false
+}
+```
+
+<a id='deepspeed-zero3-example'></a>
+
+#### ZeRO-3 示例
+
+这是一个完整的 ZeRO-3 自动配置文件 `ds_config_zero3.json`：
+
+```json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": "auto",
+            "betas": "auto",
+            "eps": "auto",
+            "weight_decay": "auto"
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": "auto",
+            "warmup_max_lr": "auto",
+            "warmup_num_steps": "auto"
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": "auto",
+        "stage3_prefetch_bucket_size": "auto",
+        "stage3_param_persistence_threshold": "auto",
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "gradient_accumulation_steps": "auto",
+    "gradient_clipping": "auto",
+    "steps_per_print": 2000,
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto",
+    "wall_clock_breakdown": false
+}
+```
+
+这是一个完整的 手动设置的启用所有功能的ZeRO-3 配置文件。主要是为了让您看到典型的参数值是什么样的，但我们强烈建议使用其中包含多个 `auto` 设置的配置文件。
+
+```json
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    },
+
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 3e-5,
+            "betas": [0.8, 0.999],
+            "eps": 1e-8,
+            "weight_decay": 3e-7
+        }
+    },
+
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 3e-5,
+            "warmup_num_steps": 500
+        }
+    },
+
+    "zero_optimization": {
+        "stage": 3,
+        "offload_optimizer": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": true
+        },
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "sub_group_size": 1e9,
+        "reduce_bucket_size": 1e6,
+        "stage3_prefetch_bucket_size": 0.94e6,
+        "stage3_param_persistence_threshold": 1e4,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+
+    "steps_per_print": 2000,
+    "wall_clock_breakdown": false
+}
+```
+
+#### 如何选择最佳性能的ZeRO Stage和 offloads
+
+了解了这些不同stages后，现在您需要决定使用哪个stage。本节将尝试回答这个问题。
+
+通常，以下规则适用：
+
+- 速度方面（左边比右边快）
+
+  stage 0（DDP） > stage 1 > stage 2 > stage 2 + offload  > stage 3 > stage3 + offload
+
+- GPU内存使用方面（右边比左边更节省GPU内存）
+
+  stage 0（DDP） < stage 1 < stage 2 < stage 2 + offload < stage 3 < stage 3 + offload
+
+所以，当您希望在尽量使用较少数量的GPU的同时获得最快的执行速度时，可以按照以下步骤进行。我们从最快的方法开始，如果遇到GPU内存溢出，然后切换到下一个速度较慢但使用的GPU内存更少的方法。以此类推。
+
+首先，将批量大小设置为1（您始终可以使用梯度累积来获得任何所需的有效批量大小）。
+
+
+1. 启用 `--gradient_checkpointing 1`（HF Trainer）或直接 `model.gradient_checkpointing_enable()` - 如果发生OOM（Out of Memory），则执行以下步骤。
+2. 首先尝试 ZeRO stage 2。如果发生OOM，则执行以下步骤。
+3. 尝试 ZeRO stage 2 + `offload_optimizer` - 如果发生OOM，则执行以下步骤。
+4. 切换到 ZeRO stage 3 - 如果发生OOM，则执行以下步骤。
+5. 启用 `offload_param` 到 `cpu` - 如果发生OOM，则执行以下步骤。
+6. 启用 `offload_optimizer` 到 `cpu` - 如果发生OOM，则执行以下步骤。
+7. 如果仍然无法适应批量大小为1，请首先检查各种默认值并尽可能降低它们。例如，如果使用 `generate` 并且不使用宽搜索束，将其缩小，因为它会占用大量内存。
+8. 绝对要使用混合半精度而非fp32 - 在Ampere及更高的GPU上使用bf16，在旧的GPU体系结构上使用fp16。
+9. 如果仍然发生OOM，可以添加更多硬件或启用ZeRO-Infinity - 即切换 `offload_param` 和 `offload_optimizer` 到 `nvme`。您需要确保它是非常快的NVMe。作为趣闻，我曾经能够在一个小型GPU上使用BLOOM-176B进行推理，使用了ZeRO-Infinity，尽管速度非常慢。但它奏效了！
+
+当然，您也可以按相反的顺序进行这些步骤，从最节省GPU内存的配置开始，然后逐步反向进行，或者尝试进行二分法。
+
+一旦您的批量大小为1不会导致OOM，就测量您的有效吞吐量。
+
+接下来尝试将批量大小增加到尽可能大，因为批量大小越大，GPU的效率越高，特别是在它们乘法运算的矩阵很大时。
+
+现在性能优化游戏开始了。您可以关闭一些offload特性，或者降低ZeRO stage，并增加/减少批量大小，再次测量有效吞吐量。反复尝试，直到满意为止。
+
+不要花费太多时间，但如果您即将开始一个为期3个月的训练 - 请花几天时间找到吞吐量方面最有效的设置。这样您的训练成本将最低，而且您会更快地完成训练。在当前快节奏的机器学习世界中，如果您花费一个额外的月份来训练某样东西，你很可能会错过一个黄金机会。当然，这只是我分享的一种观察，我并不是在催促你。在开始训练BLOOM-176B之前，我花了2天时间进行这个过程，成功将吞吐量从90 TFLOPs提高到150 TFLOPs！这一努力为我们节省了一个多月的训练时间。
+
+这些注释主要是为训练模式编写的，但它们在推理中也应该大部分适用。例如，在推理中，Gradient Checkpointing 是无用的，因为它只在训练过程中有用。此外，我们发现，如果你正在进行多GPU推理并且不使用 [DeepSpeed-Inference](https://www.deepspeed.ai/tutorials/inference-tutorial/)，[Accelerate](https://huggingface.co/blog/bloom-inference-pytorch-scripts) 应该提供更优越的性能。
+
+其他与性能相关的快速注释：
+- 如果您从头开始训练某个模型，请尽量确保张量的形状可以被16整除（例如隐藏层大小）。对于批量大小，至少尝试可被2整除。如果您想从GPU中挤取更高性能，还有一些硬件特定的[wave和tile量化](https://developer.nvidia.com/blog/optimizing-gpu-performance-tensor-cores/)的可整除性。
+
+
+
+### Activation Checkpointing 或 Gradient Checkpointing
+
+Activation Checkpointing和Gradient Checkpointing是指相同方法的两个不同术语。这确实让人感到困惑，但事实就是这样。
+
+Gradient Checkpointing允许通过牺牲速度来换取GPU内存，这要么使您能够克服GPU内存溢出，要么增加批量大小来获得更好的性能。
+
+HF Transformers 模型对DeepSpeed的Activation Checkpointing一无所知，因此如果尝试在DeepSpeed配置文件中启用该功能，什么都不会发生。
+
+因此，您有两种方法可以利用这个非常有益的功能：
+
+1. 如果您想使用 HF Transformers 模型，你可以使用 `model.gradient_checkpointing_enable()` 或在 HF Trainer 中使用 `--gradient_checkpointing`，它会自动为您启用这个功能。在这里使用了 `torch.utils.checkpoint`。
+2. 如果您编写自己的模型并希望使用DeepSpeed的Activation Checkpointing，可以使用[规定的API](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html)。您还可以使用 HF Transformers 的模型代码，将 `torch.utils.checkpoint` 替换为 DeepSpeed 的API。后者更灵活，因为它允许您将前向激活值卸载到CPU内存，而不是重新计算它们。
+
+
+### Optimizer 和 Scheduler
+
+只要你不启用 `offload_optimizer`，您可以混合使用DeepSpeed和HuggingFace的调度器和优化器，但有一个例外，即不要使用HuggingFace调度器和DeepSpeed优化器的组合：
+
+
+| Combos       | HF Scheduler | DS Scheduler |
+|:-------------|:-------------|:-------------|
+| HF Optimizer | Yes          | Yes          |
+| DS Optimizer | No           | Yes          |
+
+在启用 `offload_optimizer` 的情况下，可以使用非DeepSpeed优化器，只要该优化器具有CPU和GPU的实现（除了LAMB）。
+
+<a id='deepspeed-optimizer'></a>
+
+#### Optimizer
+
+DeepSpeed的主要优化器包括Adam、AdamW、OneBitAdam和Lamb。这些优化器已经与ZeRO进行了彻底的测试，因此建议使用它们。然而，也可以导入`torch`中的其他优化器。完整的文档在[这里](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters)。
+
+如果在配置文件中不配置`optimizer`条目，[`Trainer`] 将自动将其设置为 `AdamW`，并使用提供的值或以下命令行参数的默认值：`--learning_rate`、`--adam_beta1`、`--adam_beta2`、`--adam_epsilon` 和 `--weight_decay`。
+
+以下是`AdamW` 的自动配置示例：
+
+```json
+{
+   "optimizer": {
+       "type": "AdamW",
+       "params": {
+         "lr": "auto",
+         "betas": "auto",
+         "eps": "auto",
+         "weight_decay": "auto"
+       }
+   }
+}
+```
+
+请注意，命令行参数将设置配置文件中的值。这是为了有一个明确的值来源，并避免在不同地方设置学习率等值时难以找到的错误。命令行参数配置高于其他。被覆盖的值包括：
+
+- `lr` 的值为 `--learning_rate`
+- `betas` 的值为 `--adam_beta1 --adam_beta2`
+- `eps` 的值为 `--adam_epsilon`
+- `weight_decay` 的值为 `--weight_decay`
+
+因此，请记住在命令行上调整共享的超参数。
+
+您也可以显式地设置这些值：
+
+```json
+{
+   "optimizer": {
+       "type": "AdamW",
+       "params": {
+         "lr": 0.001,
+         "betas": [0.8, 0.999],
+         "eps": 1e-8,
+         "weight_decay": 3e-7
+       }
+   }
+}
+```
+
+但在这种情况下，您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+如果您想使用上面未列出的其他优化器，您将不得不将其添加到顶层配置中。
+
+```json
+{
+   "zero_allow_untested_optimizer": true
+}
+```
+
+类似于 `AdamW`，您可以配置其他官方支持的优化器。只是记住这些可能有不同的配置值。例如，对于Adam，您可能需要将 `weight_decay` 设置在 `0.01` 左右。
+
+此外，当与DeepSpeed的CPU Adam优化器一起使用时，offload的效果最好。如果您想在offload时使用不同的优化器，自 `deepspeed==0.8.3` 起，您还需要添加：
+
+
+```json
+{
+   "zero_force_ds_cpu_optimizer": false
+}
+```
+到顶层配置中。
+
+
+
+<a id='deepspeed-scheduler'></a>
+
+#### Scheduler
+
+DeepSpeed支持`LRRangeTest`、`OneCycle`、`WarmupLR`和`WarmupDecayLR`学习率调度器。完整文档在[这里](https://www.deepspeed.ai/docs/config-json/#scheduler-parameters)。
+
+以下是🤗 Transformers 和 DeepSpeed 之间的调度器重叠部分：
+
+- 通过 `--lr_scheduler_type constant_with_warmup` 实现 `WarmupLR`
+- 通过 `--lr_scheduler_type linear` 实现 `WarmupDecayLR`。这也是 `--lr_scheduler_type` 的默认值，因此，如果不配置调度器，这将是默认配置的调度器。
+
+如果在配置文件中不配置 `scheduler` 条目，[`Trainer`] 将使用 `--lr_scheduler_type`、`--learning_rate` 和 `--warmup_steps` 或 `--warmup_ratio` 的值来配置其🤗 Transformers 版本。
+
+以下是 `WarmupLR` 的自动配置示例：
+
+```json
+{
+   "scheduler": {
+         "type": "WarmupLR",
+         "params": {
+             "warmup_min_lr": "auto",
+             "warmup_max_lr": "auto",
+             "warmup_num_steps": "auto"
+         }
+     }
+}
+```
+
+由于使用了 *"auto"*，[`Trainer`] 的参数将在配置文件中设置正确的值。这是为了有一个明确的值来源，并避免在不同地方设置学习率等值时难以找到的错误。命令行配置高于其他。被设置的值包括：
+
+- `warmup_min_lr` 的值为 `0`。
+- `warmup_max_lr` 的值为 `--learning_rate`。
+- `warmup_num_steps` 的值为 `--warmup_steps`（如果提供）。否则，将使用 `--warmup_ratio` 乘以训练步骤的数量，并四舍五入。
+- `total_num_steps` 的值为 `--max_steps` 或者如果没有提供，将在运行时根据环境、数据集的大小和其他命令行参数（对于 `WarmupDecayLR` 来说需要）自动推导。
+
+当然，您可以接管任何或所有的配置值，并自行设置这些值：
+
+```json
+{
+   "scheduler": {
+         "type": "WarmupLR",
+         "params": {
+             "warmup_min_lr": 0,
+             "warmup_max_lr": 0.001,
+             "warmup_num_steps": 1000
+         }
+     }
+}
+```
+
+但在这种情况下，您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+例如，对于 `WarmupDecayLR`，您可以使用以下条目：
+
+```json
+{
+   "scheduler": {
+         "type": "WarmupDecayLR",
+         "params": {
+             "last_batch_iteration": -1,
+             "total_num_steps": "auto",
+             "warmup_min_lr": "auto",
+             "warmup_max_lr": "auto",
+             "warmup_num_steps": "auto"
+         }
+     }
+}
+```
+
+然后，`total_num_steps`、`warmup_max_lr`、`warmup_num_steps` 和 `total_num_steps` 将在加载时设置。
+
+
+<a id='deepspeed-fp32'></a>
+
+### fp32精度
+
+DeepSpeed支持完整的fp32和fp16混合精度。
+
+由于fp16混合精度具有更小的内存需求和更快的速度，唯一不使用它的时候是当您使用的模型在这种训练模式下表现不佳时。通常，当模型没有在fp16混合精度下进行预训练时（例如，bf16预训练模型经常出现这种情况），会出现这种情况。这样的模型可能会发生溢出或下溢，导致 `NaN` 损失。如果是这种情况，那么您将希望使用完整的fp32模式，通过显式禁用默认启用的fp16混合精度模式：
+
+```json
+{
+    "fp16": {
+        "enabled": false,
+    }
+}
+```
+
+如果您使用基于Ampere架构的GPU，PyTorch版本1.7及更高版本将自动切换到使用更高效的tf32格式进行一些操作，但结果仍将以fp32格式呈现。有关详细信息和基准测试，请参见[TensorFloat-32(TF32) on Ampere devices](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices)。如果出于某种原因您不希望使用它，该文档包括有关如何禁用此自动转换的说明。
+
+在🤗 Trainer中，你可以使用 `--tf32` 来启用它，或使用 `--tf32 0` 或 `--no_tf32` 来禁用它。默认情况下，使用PyTorch的默认设置。
+
+
+
+<a id='deepspeed-amp'></a>
+
+### 自动混合精度
+
+您可以使用自动混合精度，可以选择使用类似 PyTorch AMP 的方式，也可以选择使用类似 Apex 的方式：
+
+### fp16
+
+要配置PyTorch AMP-like 的 fp16（float16） 模式，请设置：
+
+```json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    }
+}
+```
+
+并且，[`Trainer`]将根据`args.fp16_backend`的值自动启用或禁用它。其余的配置值由您决定。
+
+当传递`--fp16 --fp16_backend amp`或`--fp16_full_eval`命令行参数时，此模式将被启用。
+
+您也可以显式地启用/禁用此模式：
+
+```json
+{
+    "fp16": {
+        "enabled": true,
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    }
+}
+```
+
+但是之后您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+以下是[相关文档](https://www.deepspeed.ai/docs/config-json/#fp16-training-options)
+
+
+### bf16
+
+如果需要使用bfloat16而不是fp16，那么可以使用以下配置部分：
+
+```json
+{
+    "bf16": {
+        "enabled": "auto"
+    }
+}
+```
+
+bf16具有与fp32相同的动态范围，因此不需要损失缩放。
+
+当传递`--bf16`或`--bf16_full_eval`命令行参数时，启用此模式。
+
+您还可以显式地启用/禁用此模式：
+
+```json
+{
+    "bf16": {
+        "enabled": true
+    }
+}
+```
+
+<Tip>
+
+在`deepspeed==0.6.0`版本中，bf16支持是新的实验性功能。
+
+如果您启用了bf16来进行[梯度累积](#gradient-accumulation)，您需要意识到它会以bf16累积梯度，这可能不是您想要的，因为这种格式的低精度可能会导致lossy accumulation。
+
+修复这个问题的工作正在努力进行，同时提供了使用更高精度的`dtype`（fp16或fp32）的选项。
+
+</Tip>
+
+
+### NCCL集合
+
+在训练过程中，有两种数据类型：`dtype`和用于通信收集操作的`dtype`，如各种归约和收集/分散操作。
+
+所有的gather/scatter操作都是在数据相同的`dtype`中执行的，所以如果您正在使用bf16的训练模式，那么它将在bf16中进行gather操作 - gather操作是非损失性的。
+
+各种reduce操作可能会是非常损失性的，例如当梯度在多个gpu上平均时，如果通信是在fp16或bf16中进行的，那么结果可能是有损失性的 - 因为当在一个低精度中添加多个数字时，结果可能不是精确的。更糟糕的是，bf16比fp16具有更低的精度。通常，当平均梯度时，损失最小，这些梯度通常非常小。因此，对于半精度训练，默认情况下，fp16被用作reduction操作的默认值。但是，您可以完全控制这个功能，如果你选择的话，您可以添加一个小的开销，并确保reductions将使用fp32作为累积数据类型，只有当结果准备好时，它才会降级到您在训练中使用的半精度`dtype`。
+
+要覆盖默认设置，您只需添加一个新的配置条目：
+
+```json
+{
+    "communication_data_type": "fp32"
+}
+```
+
+根据这个信息，有效的值包括"fp16"、"bfp16"和"fp32"。
+
+注意：在stage zero 3中，bf16通信数据类型存在一个bug，该问题已在`deepspeed==0.8.1`版本中得到修复。
+
+
+### apex
+
+配置apex AMP-like模式：
+
+```json
+"amp": {
+    "enabled": "auto",
+    "opt_level": "auto"
+}
+```
+
+并且，[`Trainer`]将根据`args.fp16_backend`和`args.fp16_opt_level`的值自动配置它。
+
+当传递`--fp16 --fp16_backend apex --fp16_opt_level 01`命令行参数时，此模式将被启用。
+
+您还可以显式配置此模式：
+
+```json
+{
+    "amp": {
+        "enabled": true,
+        "opt_level": "O1"
+    }
+}
+```
+
+但是，您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+这里是[文档](https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options)
+
+
+<a id='deepspeed-bs'></a>
+
+### Batch Size
+
+配置batch size可以使用如下参数:
+
+```json
+{
+    "train_batch_size": "auto",
+    "train_micro_batch_size_per_gpu": "auto"
+}
+```
+
+并且，[`Trainer`]将自动将`train_micro_batch_size_per_gpu`设置为`args.per_device_train_batch_size`的值，并将`train_batch_size`设置为`args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps`。
+
+您也可以显式设置这些值：
+
+```json
+{
+    "train_batch_size": 12,
+    "train_micro_batch_size_per_gpu": 4
+}
+```
+
+但是，您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+
+<a id='deepspeed-grad-acc'></a>
+
+### Gradient Accumulation
+
+配置gradient accumulation设置如下:
+
+```json
+{
+    "gradient_accumulation_steps": "auto"
+}
+```
+
+并且，[`Trainer`]将自动将其设置为`args.gradient_accumulation_steps`的值。
+
+您也可以显式设置这个值：
+
+```json
+{
+    "gradient_accumulation_steps": 3
+}
+```
+
+但是，您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+
+<a id='deepspeed-grad-clip'></a>
+
+### Gradient Clipping
+
+配置gradient clipping如下:
+
+```json
+{
+    "gradient_clipping": "auto"
+}
+```
+
+并且，[`Trainer`]将自动将其设置为`args.max_grad_norm`的值。
+
+您也可以显式设置这个值：
+
+```json
+{
+    "gradient_clipping": 1.0
+}
+```
+
+但是，您需要自己同步[`Trainer`]命令行参数和DeepSpeed配置。
+
+
+
+<a id='deepspeed-weight-extraction'></a>
+
+### 获取模型权重
+
+只要您继续使用DeepSpeed进行训练和恢复，您就不需要担心任何事情。DeepSpeed在其自定义检查点优化器文件中存储fp32主权重，这些文件是`global_step*/*optim_states.pt`（这是glob模式），并保存在正常的checkpoint下。
+
+**FP16权重：**
+
+当模型保存在ZeRO-2下时，您最终会得到一个包含模型权重的普通`pytorch_model.bin`文件，但它们只是权重的fp16版本。
+
+在ZeRO-3下，事情要复杂得多，因为模型权重分布在多个GPU上，因此需要`"stage3_gather_16bit_weights_on_model_save": true`才能让`Trainer`保存fp16版本的权重。如果这个设置是`False`，`pytorch_model.bin`将不会被创建。这是因为默认情况下，DeepSpeed的`state_dict`包含一个占位符而不是实际的权重。如果我们保存这个`state_dict`，就无法再加载它了。
+
+
+```json
+{
+    "zero_optimization": {
+        "stage3_gather_16bit_weights_on_model_save": true
+    }
+}
+```
+
+**FP32权重：**
+
+虽然fp16权重适合恢复训练，但如果您完成了模型的微调并希望将其上传到[models hub](https://huggingface.co/models)或传递给其他人，您很可能想要获取fp32权重。这最好不要在训练期间完成，因为这需要大量内存，因此最好在训练完成后离线进行。但是，如果需要并且有充足的空闲CPU内存，可以在相同的训练脚本中完成。以下部分将讨论这两种方法。
+
+**实时FP32权重恢复：**
+
+如果您的模型很大，并且在训练结束时几乎没有剩余的空闲CPU内存，这种方法可能不起作用。
+
+如果您至少保存了一个检查点，并且想要使用最新的一个，可以按照以下步骤操作：
+
+```python
+from transformers.trainer_utils import get_last_checkpoint
+from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
+checkpoint_dir = get_last_checkpoint(trainer.args.output_dir)
+fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+```
+
+如果您在使用`--load_best_model_at_end`类：*~transformers.TrainingArguments*参数（用于跟踪最佳
+检查点），那么你可以首先显式地保存最终模型，然后再执行相同的操作：
+
+```python
+from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+
+checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final")
+trainer.deepspeed.save_checkpoint(checkpoint_dir)
+fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+```
+
+<Tip>
+
+注意，一旦运行了`load_state_dict_from_zero_checkpoint`，该模型将不再可以在相同的应用程序的DeepSpeed上下文中使用。也就是说，您需要重新初始化deepspeed引擎，因为`model.load_state_dict(state_dict)`会从其中移除所有的DeepSpeed相关点。所以您只能训练结束时这样做。
+
+</Tip>
+
+当然，您不必使用类：*~transformers.Trainer*，您可以根据你的需求调整上面的示例。
+
+如果您出于某种原因想要更多的优化，您也可以提取权重的fp32 `state_dict`并按照以下示例进行操作：
+
+```python
+from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+
+state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)  # already on cpu
+model = model.cpu()
+model.load_state_dict(state_dict)
+```
+
+**离线FP32权重恢复：**
+
+DeepSpeed会创建一个特殊的转换脚本`zero_to_fp32.py`，并将其放置在checkpoint文件夹的顶层。使用此脚本，您可以在任何时候提取权重。该脚本是独立的，您不再需要配置文件或`Trainer`来执行提取操作。
+
+假设您的checkpoint文件夹如下所示：
+
+```bash
+$ ls -l output_dir/checkpoint-1/
+-rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json
+drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/
+-rw-rw-r-- 1 stas stas   12 Mar 27 13:16 latest
+-rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt
+-rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin
+-rw-rw-r-- 1 stas stas  623 Mar 27 20:42 scheduler.pt
+-rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json
+-rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model
+-rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json
+-rw-rw-r-- 1 stas stas  339 Mar 27 20:42 trainer_state.json
+-rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin
+-rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py*
+```
+
+在这个例子中，只有一个DeepSpeed检查点子文件夹*global_step1*。因此，要重构fp32权重，只需运行：
+
+```bash
+python zero_to_fp32.py . pytorch_model.bin
+```
+
+这就是它。`pytorch_model.bin`现在将包含从多个GPUs合并的完整的fp32模型权重。
+
+该脚本将自动能够处理ZeRO-2或ZeRO-3 checkpoint。
+
+`python zero_to_fp32.py -h`将为您提供使用细节。
+
+该脚本将通过文件`latest`的内容自动发现deepspeed子文件夹，在当前示例中，它将包含`global_step1`。
+
+注意：目前该脚本需要2倍于最终fp32模型权重的通用内存。
+
+
+### ZeRO-3 和 Infinity Nuances
+
+ZeRO-3与ZeRO-2有很大的不同，主要是因为它的参数分片功能。
+
+ZeRO-Infinity进一步扩展了ZeRO-3，以支持NVMe内存和其他速度和可扩展性改进。
+
+尽管所有努力都是为了在不需要对模型进行任何特殊更改的情况下就能正常运行，但在某些情况下，您可能需要以下信息。
+
+
+#### 构建大模型
+
+DeepSpeed/ZeRO-3可以处理参数量达到数万亿的模型，这些模型可能无法适应现有的内存。在这种情况下，如果您还是希望初始化更快地发生，可以使用*deepspeed.zero.Init()*上下文管理器（也是一个函数装饰器）来初始化模型，如下所示：
+
+```python
+from transformers import T5ForConditionalGeneration, T5Config
+import deepspeed
+
+with deepspeed.zero.Init():
+    config = T5Config.from_pretrained("t5-small")
+    model = T5ForConditionalGeneration(config)
+```
+
+如您所见，这会为您随机初始化一个模型。
+
+如果您想使用预训练模型，`model_class.from_pretrained`将在`is_deepspeed_zero3_enabled()`返回`True`的情况下激活此功能，目前这是通过传递的DeepSpeed配置文件中的ZeRO-3配置部分设置的。因此，在调用`from_pretrained`之前，您必须创建**TrainingArguments**对象。以下是可能的顺序示例：
+
+```python
+from transformers import AutoModel, Trainer, TrainingArguments
+
+training_args = TrainingArguments(..., deepspeed=ds_config)
+model = AutoModel.from_pretrained("t5-small")
+trainer = Trainer(model=model, args=training_args, ...)
+```
+
+如果您使用的是官方示例脚本，并且命令行参数中包含`--deepspeed ds_config.json`且启用了ZeRO-3配置，那么一切都已经为您准备好了，因为这是示例脚本的编写方式。
+
+注意：如果模型的fp16权重无法适应单个GPU的内存，则必须使用此功能。
+
+有关此方法和其他相关功能的完整详细信息，请参阅[构建大模型](https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models)。
+
+此外，在加载fp16预训练模型时，您希望`from_pretrained`使用`torch_dtype=torch.float16`。详情请参见[from_pretrained-torch-dtype](#from_pretrained-torch-dtype)。
+
+
+#### 参数收集
+
+在多个GPU上使用ZeRO-3时，没有一个GPU拥有所有参数，除非它是当前执行层的参数。因此，如果您需要一次访问所有层的所有参数，有一个特定的方法可以实现。
+您可能不需要它，但如果您需要，请参考[参数收集](https://deepspeed.readthedocs.io/en/latest/zero3.html#manual-parameter-coordination)。
+
+然而，我们在多个地方确实使用了它，其中一个例子是在`from_pretrained`中加载预训练模型权重。我们一次加载一层，然后立即将其分区到所有参与的GPU上，因为对于非常大的模型，无法在一个GPU上一次性加载并将其分布到多个GPU上，因为内存限制。
+
+此外，在ZeRO-3下，如果您编写自己的代码并遇到看起来像这样的模型参数权重：
+
+```python
+tensor([1.0], device="cuda:0", dtype=torch.float16, requires_grad=True)
+```
+
+强调`tensor([1.])`，或者如果您遇到一个错误，它说参数的大小是`1`，而不是某个更大的多维形状，这意味着参数被划分了，你看到的是一个ZeRO-3占位符。
+
+
+
+<a id='deepspeed-zero-inference'></a>
+
+
+### ZeRO 推理
+
+"ZeRO 推断" 使用与 "ZeRO-3 训练" 相同的配置。您只需要去掉优化器和调度器部分。实际上，如果您希望与训练共享相同的配置文件，您可以将它们保留在配置文件中，它们只会被忽略。
+
+您只需要传递通常的[`TrainingArguments`]参数。例如：
+
+```bash
+deepspeed --num_gpus=2 your_program.py <normal cl args> --do_eval --deepspeed ds_config.json
+```
+
+唯一的重要事情是您需要使用ZeRO-3配置，因为ZeRO-2对于推理没有任何优势，因为只有ZeRO-3才对参数进行分片，而ZeRO-1则对梯度和优化器状态进行分片。
+
+以下是在DeepSpeed下运行`run_translation.py`启用所有可用GPU的示例：
+
+```bash
+deepspeed examples/pytorch/translation/run_translation.py \
+--deepspeed tests/deepspeed/ds_config_zero3.json \
+--model_name_or_path t5-small --output_dir output_dir \
+--do_eval --max_eval_samples 50 --warmup_steps 50  \
+--max_source_length 128 --val_max_target_length 128 \
+--overwrite_output_dir --per_device_eval_batch_size 4 \
+--predict_with_generate --dataset_config "ro-en" --fp16 \
+--source_lang en --target_lang ro --dataset_name wmt16 \
+--source_prefix "translate English to Romanian: "
+```
+
+由于在推理阶段，优化器状态和梯度不需要额外的大量内存，您应该能够将更大的批次和/或序列长度放到相同的硬件上。
+
+此外，DeepSpeed目前正在开发一个名为Deepspeed-Inference的相关产品，它与ZeRO技术无关，而是使用张量并行来扩展无法适应单个GPU的模型。这是一个正在进行的工作，一旦该产品完成，我们将提供集成。
+
+
+### 内存要求
+
+由于 DeepSpeed ZeRO 可以将内存卸载到 CPU（和 NVMe），该框架提供了一些工具，允许根据使用的 GPU 数量告知将需要多少 CPU 和 GPU 内存。
+
+让我们估计在单个GPU上微调"bigscience/T0_3B"所需的内存：
+
+```bash
+$ python -c 'from transformers import AutoModel; \
+from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
+model = AutoModel.from_pretrained("bigscience/T0_3B"); \
+estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)'
+[...]
+Estimated memory needed for params, optim states and gradients for a:
+HW: Setup with 1 node, 1 GPU per node.
+SW: Model with 2783M total params, 65M largest layer params.
+  per CPU  |  per GPU |   Options
+   70.00GB |   0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
+   70.00GB |   0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
+   62.23GB |   5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=1
+   62.23GB |   5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=0
+    0.37GB |  46.91GB | offload_param=none, offload_optimizer=none, zero_init=1
+   15.56GB |  46.91GB | offload_param=none, offload_optimizer=none, zero_init=0
+```
+
+因此，您可以将模型拟合在单个80GB的GPU上，不进行CPU offload，或者使用微小的8GB GPU，但需要约60GB的CPU内存。（请注意，这仅是参数、优化器状态和梯度所需的内存 - 您还需要为CUDA内核、激活值和临时变量分配更多的内存。）
+
+然后，这是成本与速度的权衡。购买/租用较小的 GPU（或较少的 GPU，因为您可以使用多个 GPU 进行 Deepspeed ZeRO）。但这样会更慢，因此即使您不关心完成某项任务的速度，减速也直接影响 GPU 使用的持续时间，从而导致更大的成本。因此，请进行实验并比较哪种方法效果最好。
+
+如果您有足够的GPU内存，请确保禁用CPU/NVMe卸载，因为这会使所有操作更快。
+
+例如，让我们重复相同的操作，使用2个GPU：
+
+```bash
+$ python -c 'from transformers import AutoModel; \
+from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
+model = AutoModel.from_pretrained("bigscience/T0_3B"); \
+estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)'
+[...]
+Estimated memory needed for params, optim states and gradients for a:
+HW: Setup with 1 node, 2 GPUs per node.
+SW: Model with 2783M total params, 65M largest layer params.
+  per CPU  |  per GPU |   Options
+   70.00GB |   0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
+   70.00GB |   0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
+   62.23GB |   2.84GB | offload_param=none, offload_optimizer=cpu , zero_init=1
+   62.23GB |   2.84GB | offload_param=none, offload_optimizer=cpu , zero_init=0
+    0.74GB |  23.58GB | offload_param=none, offload_optimizer=none, zero_init=1
+   31.11GB |  23.58GB | offload_param=none, offload_optimizer=none, zero_init=0
+
+```
+
+所以，您需要2个32GB或更高的GPU，且不进行CPU卸载。
+
+如需了解更多信息，请参阅[内存估算器](https://deepspeed.readthedocs.io/en/latest/memory.html)。
+
+
+
+### 归档Issues
+
+请按照以下步骤提交问题，以便我们能够迅速找到问题并帮助您解除工作阻塞。
+
+在您的报告中，请始终包括以下内容：
+
+1. 完整的Deepspeed配置文件
+2. 如果使用了[`Trainer`]，则包括命令行参数；如果自己编写了Trainer设置，则包括[`TrainingArguments`]参数。请不要导出[`TrainingArguments`]，因为它有几十个与问题无关的条目。
+3. 输出：
+
+    ```bash
+    python -c 'import torch; print(f"torch: {torch.__version__}")'
+    python -c 'import transformers; print(f"transformers: {transformers.__version__}")'
+    python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'
+    ```
+
+4. 如果可能，请包含一个Google Colab notebook链接，我们可以使用它来重现问题。您可以使用这个[notebook](https://github.com/stas00/porting/blob/master/transformers/deepspeed/DeepSpeed_on_colab_CLI.ipynb)作为起点。
+5. 除非不可能，否则请始终使用标准数据集，而不是自定义数据集。
+6. 如果可能，尝试使用现有[示例](https://github.com/huggingface/transformers/tree/main/examples/pytorch)之一来重现问题。
+
+需要考虑的因素：
+
+- Deepspeed通常不是问题的原因。
+
+  一些已提交的问题被证明与Deepspeed无关。也就是说，一旦将Deepspeed从设置中移除，问题仍然存在。
+
+  因此，如果问题明显与DeepSpeed相关，例如您可以看到有一个异常并且可以看到DeepSpeed模块涉及其中，请先重新测试没有DeepSpeed的设置。只有当问题仍然存在时，才向Deepspeed提供所有必需的细节。
+
+- 如果您明确问题是在Deepspeed核心中而不是集成部分，请直接向[Deepspeed](https://github.com/microsoft/DeepSpeed/)提交问题。如果您不确定，请不要担心，无论使用哪个issue跟踪问题都可以，一旦您发布问题，我们会弄清楚并将其重定向到另一个issue跟踪（如果需要的话）。
+
+
+
+### Troubleshooting
+
+#### 启动时`deepspeed`进程被终止，没有回溯
+
+如果启动时`deepspeed`进程被终止，没有回溯，这通常意味着程序尝试分配的CPU内存超过了系统的限制或进程被允许分配的内存，操作系统内核杀死了该进程。这是因为您的配置文件很可能将`offload_optimizer`或`offload_param`或两者都配置为卸载到`cpu`。如果您有NVMe，可以尝试在ZeRO-3下卸载到NVMe。这里是如何[估计特定模型所需的内存](https://deepspeed.readthedocs.io/en/latest/memory.html)。
+
+#### 训练和/或评估/预测loss为`NaN`
+
+这种情况通常发生在使用bf16混合精度模式预训练的模型试图在fp16（带或不带混合精度）下使用时。大多数在TPU上训练的模型以及由谷歌发布的模型都属于这个类别（例如，几乎所有基于t5的模型）。在这种情况下，解决方案是要么使用fp32，要么在支持的情况下使用bf16（如TPU、Ampere GPU或更新的版本）。
+
+另一个问题可能与使用fp16有关。当您配置此部分时：
+
+```json
+{
+    "fp16": {
+        "enabled": "auto",
+        "loss_scale": 0,
+        "loss_scale_window": 1000,
+        "initial_scale_power": 16,
+        "hysteresis": 2,
+        "min_loss_scale": 1
+    }
+}
+```
+
+并且您在日志中看到Deepspeed报告`OVERFLOW`如下
+
+```
+0%|                                                                                                                             | 0/189 [00:00<?, ?it/s]
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 262144
+  1%|▌                                                                                                                    | 1/189 [00:00<01:26,  2.17it/s]
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072.0
+  1%|█▏
+ [...]
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
+ 14%|████████████████▌                                                                                                   | 27/189 [00:14<01:13,  2.21it/s]
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
+ 15%|█████████████████▏                                                                                                  | 28/189 [00:14<01:13,  2.18it/s]
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
+ 15%|█████████████████▊                                                                                                  | 29/189 [00:15<01:13,  2.18it/s]
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
+[...]
+```
+
+这意味着Deepspeed损失缩放器无法找到一个克服损失溢出的缩放系数。
+
+在这种情况下，通常需要提高`initial_scale_power`的值。将其设置为`"initial_scale_power": 32`通常会解决问题。
+
+
+
+### 注意事项
+
+- DeepSpeed 与 PyTorch [`Trainer`] 一起工作，但不与 TF [`TFTrainer`] 一起工作。
+- 尽管 DeepSpeed 有一个可安装的 PyPI 包，但强烈建议从源代码安装它，以最好地匹配您的硬件，如果您需要启用某些功能，如 1-bit Adam，这些功能在 pypi 发行版中不可用。
+- 您不必使用🤗  Transformers的 [`Trainer`] 来使用 DeepSpeed   - 您可以使用任何模型与自己的训练器，您还需要根据 [DeepSpeed 集成说明](https://www.deepspeed.ai/getting-started/#writing-deepspeed-models) 调整后者。
+
+
+
+## Non-Trainer Deepspeed集成
+
+当`Trainer`没有被使用时，`~integrations.HfDeepSpeedConfig`被用来将Deepspeed集成到huggingface的Transformers核心功能中。它唯一做的事情就是在`from_pretrained`调用期间处理Deepspeed ZeRO-3参数收集和将模型自动分割到多个GPU上。除此之外，您需要自己完成其他所有工作。
+
+当使用`Trainer`时，所有事情都自动得到了处理。
+
+当不使用`Trainer`时，为了高效地部署Deepspeed ZeRO-3，您必须在实例化模型之前实例化`~integrations.HfDeepSpeedConfig`对象并保持该对象活跃。
+
+如果您正在使用Deepspeed ZeRO-1或ZeRO-2，您根本不需要使用`HfDeepSpeedConfig`。
+
+以预训练模型为例:
+
+```python
+from transformers.integrations import HfDeepSpeedConfig
+from transformers import AutoModel
+import deepspeed
+
+ds_config = {...}  # deepspeed config object or path to the file
+# must run before instantiating the model to detect zero 3
+dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
+model = AutoModel.from_pretrained("gpt2")
+engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
+```
+
+或者以非预训练模型为例：
+
+```python
+from transformers.integrations import HfDeepSpeedConfig
+from transformers import AutoModel, AutoConfig
+import deepspeed
+
+ds_config = {...}  # deepspeed config object or path to the file
+# must run before instantiating the model to detect zero 3
+dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
+config = AutoConfig.from_pretrained("gpt2")
+model = AutoModel.from_config(config)
+engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
+```
+
+请注意，如果您没有使用[`Trainer`]集成，您完全需要自己动手。基本上遵循[Deepspeed](https://www.deepspeed.ai/)网站上的文档。同时，您必须显式配置配置文件 - 不能使用`"auto"`值，而必须放入实际值。
+
+
+## HfDeepSpeedConfig
+
+[[autodoc]] integrations.HfDeepSpeedConfig
+    - all
+
+### 自定义DeepSpeed ZeRO推理
+
+以下是一个示例，演示了在无法将模型放入单个 GPU 时如果不使用[Trainer]进行 DeepSpeed ZeRO 推理 。该解决方案包括使用额外的 GPU 或/和将 GPU 内存卸载到 CPU 内存。
+
+这里要理解的重要细微差别是，ZeRO的设计方式可以让您在不同的GPU上并行处理不同的输入。
+
+这个例子有很多注释，并且是自文档化的。
+
+请确保：
+
+1. 如果您有足够的GPU内存（因为这会减慢速度），禁用CPU offload。
+2. 如果您拥有Ampere架构或更新的GPU，启用bf16以加快速度。如果您没有这种硬件，只要不使用任何在bf16混合精度下预训练的模型（如大多数t5模型），就可以启用fp16。否则这些模型通常在fp16中溢出，您会看到输出无效结果。
+
+```python
+#!/usr/bin/env python
+
+# This script demonstrates how to use Deepspeed ZeRO in an inference mode when one can't fit a model
+# into a single GPU
+#
+# 1. Use 1 GPU with CPU offload
+# 2. Or use multiple GPUs instead
+#
+# First you need to install deepspeed: pip install deepspeed
+#
+# Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2
+# small GPUs can handle it. or 1 small GPU and a lot of CPU memory.
+#
+# To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU -
+# you will need 2-4 gpus. And then you can adapt the script to handle more gpus if you want to
+# process multiple inputs at once.
+#
+# The provided deepspeed config also activates CPU memory offloading, so chances are that if you
+# have a lot of available CPU memory and you don't mind a slowdown you should be able to load a
+# model that doesn't normally fit into a single GPU. If you have enough GPU memory the program will
+# run faster if you don't want offload to CPU - so disable that section then.
+#
+# To deploy on 1 gpu:
+#
+# deepspeed --num_gpus 1 t0.py
+# or:
+# python -m torch.distributed.run --nproc_per_node=1 t0.py
+#
+# To deploy on 2 gpus:
+#
+# deepspeed --num_gpus 2 t0.py
+# or:
+# python -m torch.distributed.run --nproc_per_node=2 t0.py
+
+
+from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM
+from transformers.integrations import HfDeepSpeedConfig
+import deepspeed
+import os
+import torch
+
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To avoid warnings about parallelism in tokenizers
+
+# distributed setup
+local_rank = int(os.getenv("LOCAL_RANK", "0"))
+world_size = int(os.getenv("WORLD_SIZE", "1"))
+torch.cuda.set_device(local_rank)
+deepspeed.init_distributed()
+
+model_name = "bigscience/T0_3B"
+
+config = AutoConfig.from_pretrained(model_name)
+model_hidden_size = config.d_model
+
+# batch size has to be divisible by world_size, but can be bigger than world_size
+train_batch_size = 1 * world_size
+
+# ds_config notes
+#
+# - enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be
+# faster.
+#
+# - for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g.
+# all official t5 models are bf16-pretrained
+#
+# - set offload_param.device to "none" or completely remove the `offload_param` section if you don't
+# - want CPU offload
+#
+# - if using `offload_param` you can manually finetune stage3_param_persistence_threshold to control
+# - which params should remain on gpus - the larger the value the smaller the offload size
+#
+# For indepth info on Deepspeed config see
+# https://huggingface.co/docs/transformers/main/main_classes/deepspeed
+
+# keeping the same format as json for consistency, except it uses lower case for true/false
+# fmt: off
+ds_config = {
+    "fp16": {
+        "enabled": False
+    },
+    "bf16": {
+        "enabled": False
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "offload_param": {
+            "device": "cpu",
+            "pin_memory": True
+        },
+        "overlap_comm": True,
+        "contiguous_gradients": True,
+        "reduce_bucket_size": model_hidden_size * model_hidden_size,
+        "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size,
+        "stage3_param_persistence_threshold": 10 * model_hidden_size
+    },
+    "steps_per_print": 2000,
+    "train_batch_size": train_batch_size,
+    "train_micro_batch_size_per_gpu": 1,
+    "wall_clock_breakdown": False
+}
+# fmt: on
+
+# next line instructs transformers to partition the model directly over multiple gpus using
+# deepspeed.zero.Init when model's `from_pretrained` method is called.
+#
+# **it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name)**
+#
+# otherwise the model will first be loaded normally and only partitioned at forward time which is
+# less efficient and when there is little CPU RAM may fail
+dschf = HfDeepSpeedConfig(ds_config)  # keep this object alive
+
+# now a model can be loaded.
+model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+
+# initialise Deepspeed ZeRO and store only the engine object
+ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0]
+ds_engine.module.eval()  # inference
+
+# Deepspeed ZeRO can process unrelated inputs on each GPU. So for 2 gpus you process 2 inputs at once.
+# If you use more GPUs adjust for more.
+# And of course if you have just one input to process you then need to pass the same string to both gpus
+# If you use only one GPU, then you will have only rank 0.
+rank = torch.distributed.get_rank()
+if rank == 0:
+    text_in = "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"
+elif rank == 1:
+    text_in = "Is this review positive or negative? Review: this is the worst restaurant ever"
+
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank)
+with torch.no_grad():
+    outputs = ds_engine.module.generate(inputs, synced_gpus=True)
+text_out = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(f"rank{rank}:\n   in={text_in}\n  out={text_out}")
+```
+
+让我们保存它为 `t0.py`并运行：
+```
+$ deepspeed --num_gpus 2 t0.py
+rank0:
+   in=Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy
+  out=Positive
+rank1:
+   in=Is this review positive or negative? Review: this is the worst restaurant ever
+  out=negative
+```
+
+这是一个非常基本的例子，您需要根据自己的需求进行修改。
+
+### `generate` 的差异
+
+在使用ZeRO stage 3的多GPU时，需要通过调用`generate(..., synced_gpus=True)`来同步GPU。如果一个GPU在其它GPU之前完成生成，整个系统将挂起，因为其他GPU无法从停止生成的GPU接收权重分片。
+
+从`transformers>=4.28`开始，如果没有明确指定`synced_gpus`，检测到这些条件后它将自动设置为`True`。但如果您需要覆盖`synced_gpus`的值，仍然可以这样做。
+
+
+
+## 测试 DeepSpeed 集成
+
+如果您提交了一个涉及DeepSpeed集成的PR，请注意我们的CircleCI PR CI设置没有GPU，因此我们只在另一个CI夜间运行需要GPU的测试。因此，如果您在PR中获得绿色的CI报告，并不意味着DeepSpeed测试通过。
+
+要运行DeepSpeed测试，请至少运行以下命令：
+
+```
+RUN_SLOW=1 pytest tests/deepspeed/test_deepspeed.py
+```
+
+如果你更改了任何模型或PyTorch示例代码，请同时运行多模型测试。以下将运行所有DeepSpeed测试：
+
+```
+RUN_SLOW=1 pytest tests/deepspeed
+```
+
+## 主要的DeepSpeed资源
+
+- [项目GitHub](https://github.com/microsoft/deepspeed)
+- [使用文档](https://www.deepspeed.ai/getting-started/)
+- [API文档](https://deepspeed.readthedocs.io/en/latest/index.html)
+- [博客文章](https://www.microsoft.com/en-us/research/search/?q=deepspeed)
+
+论文:
+
+- [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054)
+- [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://arxiv.org/abs/2101.06840)
+- [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://arxiv.org/abs/2104.07857)
+
+最后，请记住，HuggingFace [`Trainer`]仅集成了DeepSpeed，因此如果您在使用DeepSpeed时遇到任何问题或疑问，请在[DeepSpeed GitHub](https://github.com/microsoft/DeepSpeed/issues)上提交一个issue。
diff --git a/docs/source/zh/main_classes/feature_extractor.md b/docs/source/zh/main_classes/feature_extractor.md
new file mode 100644
index 00000000000000..f3b65ebf9d66cc
--- /dev/null
+++ b/docs/source/zh/main_classes/feature_extractor.md
@@ -0,0 +1,39 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Feature Extractor
+
+Feature Extractor负责为音频或视觉模型准备输入特征。这包括从序列中提取特征，例如，对音频文件进行预处理以生成Log-Mel频谱特征，以及从图像中提取特征，例如，裁剪图像文件，同时还包括填充、归一化和转换为NumPy、PyTorch和TensorFlow张量。
+
+
+## FeatureExtractionMixin
+
+[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
+    - from_pretrained
+    - save_pretrained
+
+## SequenceFeatureExtractor
+
+[[autodoc]] SequenceFeatureExtractor
+    - pad
+
+## BatchFeature
+
+[[autodoc]] BatchFeature
+
+## ImageFeatureExtractionMixin
+
+[[autodoc]] image_utils.ImageFeatureExtractionMixin
diff --git a/docs/source/zh/main_classes/image_processor.md b/docs/source/zh/main_classes/image_processor.md
new file mode 100644
index 00000000000000..035afa55348a26
--- /dev/null
+++ b/docs/source/zh/main_classes/image_processor.md
@@ -0,0 +1,34 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Image Processor
+
+Image processor负责为视觉模型准备输入特征并后期处理处理它们的输出。这包括诸如调整大小、归一化和转换为PyTorch、TensorFlow、Flax和NumPy张量等转换。它还可能包括特定于模型的后期处理，例如将logits转换为分割掩码。
+
+
+## ImageProcessingMixin
+
+[[autodoc]] image_processing_utils.ImageProcessingMixin
+    - from_pretrained
+    - save_pretrained
+
+## BatchFeature
+
+[[autodoc]] BatchFeature
+
+## BaseImageProcessor
+
+[[autodoc]] image_processing_utils.BaseImageProcessor
diff --git a/docs/source/zh/main_classes/keras_callbacks.md b/docs/source/zh/main_classes/keras_callbacks.md
new file mode 100644
index 00000000000000..1eea2eb998162c
--- /dev/null
+++ b/docs/source/zh/main_classes/keras_callbacks.md
@@ -0,0 +1,27 @@
+<!--Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Keras callbacks
+
+在Keras中训练Transformers模型时，有一些库特定的callbacks函数可用于自动执行常见任务：
+
+## KerasMetricCallback
+
+[[autodoc]] KerasMetricCallback
+
+## PushToHubCallback
+
+[[autodoc]] PushToHubCallback
diff --git a/docs/source/zh/main_classes/logging.md b/docs/source/zh/main_classes/logging.md
new file mode 100644
index 00000000000000..6116e4962e6a33
--- /dev/null
+++ b/docs/source/zh/main_classes/logging.md
@@ -0,0 +1,107 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Logging
+
+🤗 Transformers拥有一个集中式的日志系统，因此您可以轻松设置库输出的日志详细程度。
+
+当前库的默认日志详细程度为`WARNING`。
+
+要更改日志详细程度，只需使用其中一个直接的setter。例如，以下是如何将日志详细程度更改为INFO级别的方法：
+
+```python
+import transformers
+
+transformers.logging.set_verbosity_info()
+```
+
+您还可以使用环境变量`TRANSFORMERS_VERBOSITY`来覆盖默认的日志详细程度。您可以将其设置为以下级别之一：`debug`、`info`、`warning`、`error`、`critical`。例如：
+
+```bash
+TRANSFORMERS_VERBOSITY=error ./myprogram.py
+```
+
+此外，通过将环境变量`TRANSFORMERS_NO_ADVISORY_WARNINGS`设置为`true`（如*1*），可以禁用一些`warnings`。这将禁用[`logger.warning_advice`]记录的任何警告。例如：
+
+```bash
+TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
+```
+
+以下是如何在您自己的模块或脚本中使用与库相同的logger的示例：
+
+```python
+from transformers.utils import logging
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers")
+logger.info("INFO")
+logger.warning("WARN")
+```
+
+
+此日志模块的所有方法都在下面进行了记录，主要的方法包括 [`logging.get_verbosity`] 用于获取logger当前输出日志详细程度的级别和 [`logging.set_verbosity`] 用于将详细程度设置为您选择的级别。按照顺序（从最不详细到最详细），这些级别（及其相应的整数值）为：
+
+- `transformers.logging.CRITICAL` 或 `transformers.logging.FATAL`（整数值，50）：仅报告最关键的errors。
+- `transformers.logging.ERROR`（整数值，40）：仅报告errors。
+- `transformers.logging.WARNING` 或 `transformers.logging.WARN`（整数值，30）：仅报告error和warnings。这是库使用的默认级别。
+- `transformers.logging.INFO`（整数值，20）：报告error、warnings和基本信息。
+- `transformers.logging.DEBUG`（整数值，10）：报告所有信息。
+
+默认情况下，将在模型下载期间显示`tqdm`进度条。[`logging.disable_progress_bar`] 和 [`logging.enable_progress_bar`] 可用于禁止或启用此行为。
+
+## `logging` vs `warnings`
+
+Python有两个经常一起使用的日志系统：如上所述的`logging`，和对特定buckets中的警告进行进一步分类的`warnings`，例如，`FutureWarning`用于输出已经被弃用的功能或路径，`DeprecationWarning`用于指示即将被弃用的内容。
+
+我们在`transformers`库中同时使用这两个系统。我们利用并调整了`logging`的`captureWarning`方法，以便通过上面的详细程度setters来管理这些警告消息。
+
+对于库的开发人员，这意味着什么呢？我们应该遵循以下启发法则：
+- 库的开发人员和依赖于`transformers`的库应优先使用`warnings`
+- `logging`应该用于在日常项目中经常使用它的用户
+
+以下是`captureWarnings`方法的参考。
+
+[[autodoc]] logging.captureWarnings
+
+## Base setters
+
+[[autodoc]] logging.set_verbosity_error
+
+[[autodoc]] logging.set_verbosity_warning
+
+[[autodoc]] logging.set_verbosity_info
+
+[[autodoc]] logging.set_verbosity_debug
+
+## Other functions
+
+[[autodoc]] logging.get_verbosity
+
+[[autodoc]] logging.set_verbosity
+
+[[autodoc]] logging.get_logger
+
+[[autodoc]] logging.enable_default_handler
+
+[[autodoc]] logging.disable_default_handler
+
+[[autodoc]] logging.enable_explicit_format
+
+[[autodoc]] logging.reset_format
+
+[[autodoc]] logging.enable_progress_bar
+
+[[autodoc]] logging.disable_progress_bar
diff --git a/docs/source/zh/main_classes/model.md b/docs/source/zh/main_classes/model.md
index d1b1c98f59a373..6c0ee3e2b2c097 100644
--- a/docs/source/zh/main_classes/model.md
+++ b/docs/source/zh/main_classes/model.md
@@ -119,16 +119,16 @@ model = AutoModel.from_config(config)
 
 TFPreTrainedModel
 [[autodoc]] TFPreTrainedModel
-- push_to_hub
-- all
+    - push_to_hub
+    - all
 
 ## TFModelUtilsMixin
 [[autodoc]] modeling_tf_utils.TFModelUtilsMixin
 
 FlaxPreTrainedModel
 [[autodoc]] FlaxPreTrainedModel
-- push_to_hub
-- all
+    - push_to_hub
+    - all
 
 ## 推送到 Hub
 [[autodoc]] utils.PushToHubMixin
diff --git a/docs/source/zh/main_classes/onnx.md b/docs/source/zh/main_classes/onnx.md
new file mode 100644
index 00000000000000..d35dd1c4bf8e6a
--- /dev/null
+++ b/docs/source/zh/main_classes/onnx.md
@@ -0,0 +1,50 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 导出 🤗 Transformers 模型到 ONNX
+
+🤗 Transformers提供了一个`transformers.onnx`包，通过利用配置对象，您可以将模型checkpoints转换为ONNX图。
+
+有关更多详细信息，请参阅导出 🤗 Transformers 模型的[指南](../serialization)。
+
+## ONNX Configurations
+
+我们提供了三个抽象类，取决于您希望导出的模型架构类型：
+
+* 基于编码器的模型继承 [`~onnx.config.OnnxConfig`]
+* 基于解码器的模型继承 [`~onnx.config.OnnxConfigWithPast`]
+* 编码器-解码器模型继承 [`~onnx.config.OnnxSeq2SeqConfigWithPast`]
+
+### OnnxConfig
+
+[[autodoc]] onnx.config.OnnxConfig
+
+### OnnxConfigWithPast
+
+[[autodoc]] onnx.config.OnnxConfigWithPast
+
+### OnnxSeq2SeqConfigWithPast
+
+[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
+
+## ONNX Features
+
+每个ONNX配置与一组 _特性_ 相关联，使您能够为不同类型的拓扑结构或任务导出模型。
+
+### FeaturesManager
+
+[[autodoc]] onnx.features.FeaturesManager
+
diff --git a/docs/source/zh/main_classes/optimizer_schedules.md b/docs/source/zh/main_classes/optimizer_schedules.md
new file mode 100644
index 00000000000000..63e5438d77b6c4
--- /dev/null
+++ b/docs/source/zh/main_classes/optimizer_schedules.md
@@ -0,0 +1,77 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Optimization
+
+`.optimization` 模块提供了：
+
+- 一个带有固定权重衰减的优化器，可用于微调模型
+- 继承自 `_LRSchedule` 多个调度器：
+- 一个梯度累积类，用于累积多个批次的梯度
+
+## AdamW (PyTorch)
+
+[[autodoc]] AdamW
+
+## AdaFactor (PyTorch)
+
+[[autodoc]] Adafactor
+
+## AdamWeightDecay (TensorFlow)
+
+[[autodoc]] AdamWeightDecay
+
+[[autodoc]] create_optimizer
+
+## Schedules
+
+### Learning Rate Schedules (Pytorch)
+
+[[autodoc]] SchedulerType
+
+[[autodoc]] get_scheduler
+
+[[autodoc]] get_constant_schedule
+
+[[autodoc]] get_constant_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_constant_schedule.png"/>
+
+[[autodoc]] get_cosine_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_schedule.png"/>
+
+[[autodoc]] get_cosine_with_hard_restarts_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_hard_restarts_schedule.png"/>
+
+[[autodoc]] get_linear_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_linear_schedule.png"/>
+
+[[autodoc]] get_polynomial_decay_schedule_with_warmup
+
+[[autodoc]] get_inverse_sqrt_schedule
+
+### Warmup (TensorFlow)
+
+[[autodoc]] WarmUp
+
+## Gradient Strategies
+
+### GradientAccumulator (TensorFlow)
+
+[[autodoc]] GradientAccumulator
diff --git a/docs/source/zh/main_classes/output.md b/docs/source/zh/main_classes/output.md
new file mode 100644
index 00000000000000..1619e27219d834
--- /dev/null
+++ b/docs/source/zh/main_classes/output.md
@@ -0,0 +1,309 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 模型输出
+
+所有模型的输出都是 [`~utils.ModelOutput`] 的子类的实例。这些是包含模型返回的所有信息的数据结构，但也可以用作元组或字典。
+
+让我们看一个例子：
+
+```python
+from transformers import BertTokenizer, BertForSequenceClassification
+import torch
+
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+outputs = model(**inputs, labels=labels)
+```
+
+`outputs` 对象是 [`~modeling_outputs.SequenceClassifierOutput`]，如下面该类的文档中所示，它表示它有一个可选的 `loss`，一个 `logits`，一个可选的 `hidden_states` 和一个可选的 `attentions` 属性。在这里，我们有 `loss`，因为我们传递了 `labels`，但我们没有 `hidden_states` 和 `attentions`，因为我们没有传递 `output_hidden_states=True` 或 `output_attentions=True`。
+
+<Tip>
+
+当传递 `output_hidden_states=True` 时，您可能希望 `outputs.hidden_states[-1]` 与 `outputs.last_hidden_states` 完全匹配。然而，这并不总是成立。一些模型在返回最后的 hidden state时对其应用归一化或其他后续处理。
+
+</Tip>
+
+
+您可以像往常一样访问每个属性，如果模型未返回该属性，您将得到 `None`。在这里，例如，`outputs.loss` 是模型计算的损失，而 `outputs.attentions` 是 `None`。
+
+当将我们的 `outputs` 对象视为元组时，它仅考虑那些没有 `None` 值的属性。例如这里它有两个元素，`loss` 和 `logits`，所以
+
+```python
+outputs[:2]
+```
+
+将返回元组 `(outputs.loss, outputs.logits)`。
+
+将我们的 `outputs` 对象视为字典时，它仅考虑那些没有 `None` 值的属性。例如在这里它有两个键，分别是 `loss` 和 `logits`。
+
+我们在这里记录了被多个类型模型使用的通用模型输出。特定输出类型在其相应的模型页面上有文档。
+
+## ModelOutput
+
+[[autodoc]] utils.ModelOutput
+    - to_tuple
+
+## BaseModelOutput
+
+[[autodoc]] modeling_outputs.BaseModelOutput
+
+## BaseModelOutputWithPooling
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPooling
+
+## BaseModelOutputWithCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithCrossAttentions
+
+## BaseModelOutputWithPoolingAndCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
+
+## BaseModelOutputWithPast
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPast
+
+## BaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
+
+## Seq2SeqModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqModelOutput
+
+## CausalLMOutput
+
+[[autodoc]] modeling_outputs.CausalLMOutput
+
+## CausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_outputs.CausalLMOutputWithCrossAttentions
+
+## CausalLMOutputWithPast
+
+[[autodoc]] modeling_outputs.CausalLMOutputWithPast
+
+## MaskedLMOutput
+
+[[autodoc]] modeling_outputs.MaskedLMOutput
+
+## Seq2SeqLMOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqLMOutput
+
+## NextSentencePredictorOutput
+
+[[autodoc]] modeling_outputs.NextSentencePredictorOutput
+
+## SequenceClassifierOutput
+
+[[autodoc]] modeling_outputs.SequenceClassifierOutput
+
+## Seq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqSequenceClassifierOutput
+
+## MultipleChoiceModelOutput
+
+[[autodoc]] modeling_outputs.MultipleChoiceModelOutput
+
+## TokenClassifierOutput
+
+[[autodoc]] modeling_outputs.TokenClassifierOutput
+
+## QuestionAnsweringModelOutput
+
+[[autodoc]] modeling_outputs.QuestionAnsweringModelOutput
+
+## Seq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
+
+## Seq2SeqSpectrogramOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqSpectrogramOutput
+
+## SemanticSegmenterOutput
+
+[[autodoc]] modeling_outputs.SemanticSegmenterOutput
+
+## ImageClassifierOutput
+
+[[autodoc]] modeling_outputs.ImageClassifierOutput
+
+## ImageClassifierOutputWithNoAttention
+
+[[autodoc]] modeling_outputs.ImageClassifierOutputWithNoAttention
+
+## DepthEstimatorOutput
+
+[[autodoc]] modeling_outputs.DepthEstimatorOutput
+
+## Wav2Vec2BaseModelOutput
+
+[[autodoc]] modeling_outputs.Wav2Vec2BaseModelOutput
+
+## XVectorOutput
+
+[[autodoc]] modeling_outputs.XVectorOutput
+
+## Seq2SeqTSModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput
+
+## Seq2SeqTSPredictionOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput
+
+## SampleTSPredictionOutput
+
+[[autodoc]] modeling_outputs.SampleTSPredictionOutput
+
+## TFBaseModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutput
+
+## TFBaseModelOutputWithPooling
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling
+
+## TFBaseModelOutputWithPoolingAndCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
+
+## TFBaseModelOutputWithPast
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast
+
+## TFBaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
+
+## TFSeq2SeqModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput
+
+## TFCausalLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutput
+
+## TFCausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
+
+## TFCausalLMOutputWithPast
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast
+
+## TFMaskedLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput
+
+## TFSeq2SeqLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput
+
+## TFNextSentencePredictorOutput
+
+[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput
+
+## TFSequenceClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput
+
+## TFSeq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
+
+## TFMultipleChoiceModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput
+
+## TFTokenClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput
+
+## TFQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFQuestionAnsweringModelOutput
+
+## TFSeq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
+
+## FlaxBaseModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput
+
+## FlaxBaseModelOutputWithPast
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast
+
+## FlaxBaseModelOutputWithPooling
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling
+
+## FlaxBaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
+
+## FlaxSeq2SeqModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput
+
+## FlaxCausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
+
+## FlaxMaskedLMOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput
+
+## FlaxSeq2SeqLMOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput
+
+## FlaxNextSentencePredictorOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput
+
+## FlaxSequenceClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput
+
+## FlaxSeq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
+
+## FlaxMultipleChoiceModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput
+
+## FlaxTokenClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput
+
+## FlaxQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
+
+## FlaxSeq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
diff --git a/docs/source/zh/main_classes/pipelines.md b/docs/source/zh/main_classes/pipelines.md
new file mode 100644
index 00000000000000..4d2f1f0f9386a3
--- /dev/null
+++ b/docs/source/zh/main_classes/pipelines.md
@@ -0,0 +1,474 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Pipelines
+
+pipelines是使用模型进行推理的一种简单方法。这些pipelines是抽象了库中大部分复杂代码的对象，提供了一个专用于多个任务的简单API，包括专名识别、掩码语言建模、情感分析、特征提取和问答等。请参阅[任务摘要](../task_summary)以获取使用示例。
+
+有两种pipelines抽象类需要注意：
+
+- [`pipeline`]，它是封装所有其他pipelines的最强大的对象。
+- 针对特定任务pipelines，适用于[音频](#audio)、[计算机视觉](#computer-vision)、[自然语言处理](#natural-language-processing)和[多模态](#multimodal)任务。
+
+## pipeline抽象类
+
+*pipeline*抽象类是对所有其他可用pipeline的封装。它可以像任何其他pipeline一样实例化，但进一步提供额外的便利性。
+
+简单调用一个项目：
+
+
+```python
+>>> pipe = pipeline("text-classification")
+>>> pipe("This restaurant is awesome")
+[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+```
+
+如果您想使用 [hub](https://huggingface.co) 上的特定模型，可以忽略任务，如果hub上的模型已经定义了该任务：
+
+```python
+>>> pipe = pipeline(model="roberta-large-mnli")
+>>> pipe("This restaurant is awesome")
+[{'label': 'NEUTRAL', 'score': 0.7313136458396912}]
+```
+
+要在多个项目上调用pipeline，可以使用*列表*调用它。
+
+```python
+>>> pipe = pipeline("text-classification")
+>>> pipe(["This restaurant is awesome", "This restaurant is awful"])
+[{'label': 'POSITIVE', 'score': 0.9998743534088135},
+ {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
+```
+
+为了遍历整个数据集，建议直接使用 `dataset`。这意味着您不需要一次性分配整个数据集，也不需要自己进行批处理。这应该与GPU上的自定义循环一样快。如果不是，请随时提出issue。
+
+```python
+import datasets
+from transformers import pipeline
+from transformers.pipelines.pt_utils import KeyDataset
+from tqdm.auto import tqdm
+
+pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
+dataset = datasets.load_dataset("superb", name="asr", split="test")
+
+# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
+# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
+for out in tqdm(pipe(KeyDataset(dataset, "file"))):
+    print(out)
+    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+    # {"text": ....}
+    # ....
+```
+
+为了方便使用，也可以使用生成器：
+
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("text-classification")
+
+
+def data():
+    while True:
+        # This could come from a dataset, a database, a queue or HTTP request
+        # in a server
+        # Caveat: because this is iterative, you cannot use `num_workers > 1` variable
+        # to use multiple threads to preprocess data. You can still have 1 thread that
+        # does the preprocessing while the main runs the big inference
+        yield "This is a test"
+
+
+for out in pipe(data()):
+    print(out)
+    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+    # {"text": ....}
+    # ....
+```
+
+[[autodoc]] pipeline
+
+## Pipeline batching
+
+所有pipeline都可以使用批处理。这将在pipeline使用其流处理功能时起作用（即传递列表或 `Dataset` 或 `generator` 时）。
+
+```python
+from transformers import pipeline
+from transformers.pipelines.pt_utils import KeyDataset
+import datasets
+
+dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
+pipe = pipeline("text-classification", device=0)
+for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
+    print(out)
+    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+    # Exactly the same output as before, but the content are passed
+    # as batches to the model
+```
+
+<Tip warning={true}>
+
+然而，这并不自动意味着性能提升。它可能是一个10倍的加速或5倍的减速，具体取决于硬件、数据和实际使用的模型。
+
+主要是加速的示例：
+
+</Tip>
+
+```python
+from transformers import pipeline
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
+
+pipe = pipeline("text-classification", device=0)
+
+
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
+
+    def __getitem__(self, i):
+        return "This is a test"
+
+
+dataset = MyDataset()
+
+for batch_size in [1, 8, 64, 256]:
+    print("-" * 30)
+    print(f"Streaming batch_size={batch_size}")
+    for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
+        pass
+```
+
+```
+# On GTX 970
+------------------------------
+Streaming no batching
+100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
+------------------------------
+Streaming batch_size=8
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
+------------------------------
+Streaming batch_size=64
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
+------------------------------
+Streaming batch_size=256
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
+(diminishing returns, saturated the GPU)
+```
+
+主要是减速的示例：
+
+```python
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
+
+    def __getitem__(self, i):
+        if i % 64 == 0:
+            n = 100
+        else:
+            n = 1
+        return "This is a test" * n
+```
+
+与其他句子相比，这是一个非常长的句子。在这种情况下，**整个**批次将需要400个tokens的长度，因此整个批次将是 [64, 400] 而不是 [64, 4]，从而导致较大的减速。更糟糕的是，在更大的批次上，程序会崩溃。
+
+```
+------------------------------
+Streaming no batching
+100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
+------------------------------
+Streaming batch_size=8
+100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
+------------------------------
+Streaming batch_size=64
+100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
+------------------------------
+Streaming batch_size=256
+  0%|                                                                                 | 0/1000 [00:00<?, ?it/s]
+Traceback (most recent call last):
+  File "/home/nicolas/src/transformers/test.py", line 42, in <module>
+    for out in tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
+....
+    q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
+```
+
+对于这个问题，没有好的（通用）解决方案，效果可能因您的用例而异。经验法则如下：
+
+对于用户，一个经验法则是：
+
+- **使用硬件测量负载性能。测量、测量、再测量。真实的数字是唯一的方法。**
+- 如果受到延迟的限制（进行推理的实时产品），不要进行批处理。
+- 如果使用CPU，不要进行批处理。
+- 如果您在GPU上处理的是吞吐量（您希望在大量静态数据上运行模型），则：
+  - 如果对序列长度的大小没有概念（"自然"数据），默认情况下不要进行批处理，进行测试并尝试逐渐添加，添加OOM检查以在失败时恢复（如果您不能控制序列长度，它将在某些时候失败）。
+  - 如果您的序列长度非常规律，那么批处理更有可能非常有趣，进行测试并推动它，直到出现OOM。
+  - GPU越大，批处理越有可能变得更有趣
+- 一旦启用批处理，确保能够很好地处理OOM。
+
+## Pipeline chunk batching
+
+`zero-shot-classification` 和 `question-answering` 在某种意义上稍微特殊，因为单个输入可能会导致模型的多次前向传递。在正常情况下，这将导致 `batch_size` 参数的问题。
+
+为了规避这个问题，这两个pipeline都有点特殊，它们是 `ChunkPipeline` 而不是常规的 `Pipeline`。简而言之：
+
+
+```python
+preprocessed = pipe.preprocess(inputs)
+model_outputs = pipe.forward(preprocessed)
+outputs = pipe.postprocess(model_outputs)
+```
+
+现在变成：
+
+
+```python
+all_model_outputs = []
+for preprocessed in pipe.preprocess(inputs):
+    model_outputs = pipe.forward(preprocessed)
+    all_model_outputs.append(model_outputs)
+outputs = pipe.postprocess(all_model_outputs)
+```
+
+这对您的代码应该是非常直观的，因为pipeline的使用方式是相同的。
+
+这是一个简化的视图，因为Pipeline可以自动处理批次！这意味着您不必担心您的输入实际上会触发多少次前向传递，您可以独立于输入优化 `batch_size`。前面部分的注意事项仍然适用。
+
+## Pipeline自定义
+
+如果您想要重载特定的pipeline。
+
+请随时为您手头的任务创建一个issue，Pipeline的目标是易于使用并支持大多数情况，因此 `transformers` 可能支持您的用例。
+
+如果您想简单地尝试一下，可以：
+
+- 继承您选择的pipeline
+
+```python
+class MyPipeline(TextClassificationPipeline):
+    def postprocess():
+        # Your code goes here
+        scores = scores * 100
+        # And here
+
+
+my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
+# or if you use *pipeline* function, then:
+my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
+```
+
+这样就可以让您编写所有想要的自定义代码。
+
+
+## 实现一个pipeline
+
+[实现一个新的pipeline](../add_new_pipeline)
+
+## 音频
+
+可用于音频任务的pipeline包括以下几种。
+
+### AudioClassificationPipeline
+
+[[autodoc]] AudioClassificationPipeline
+    - __call__
+    - all
+
+### AutomaticSpeechRecognitionPipeline
+
+[[autodoc]] AutomaticSpeechRecognitionPipeline
+    - __call__
+    - all
+
+### TextToAudioPipeline
+
+[[autodoc]] TextToAudioPipeline
+    - __call__
+    - all
+
+
+### ZeroShotAudioClassificationPipeline
+
+[[autodoc]] ZeroShotAudioClassificationPipeline
+    - __call__
+    - all
+
+## 计算机视觉
+
+可用于计算机视觉任务的pipeline包括以下几种。
+
+### DepthEstimationPipeline
+[[autodoc]] DepthEstimationPipeline
+    - __call__
+    - all
+
+### ImageClassificationPipeline
+
+[[autodoc]] ImageClassificationPipeline
+    - __call__
+    - all
+
+### ImageSegmentationPipeline
+
+[[autodoc]] ImageSegmentationPipeline
+    - __call__
+    - all
+
+### ImageToImagePipeline
+
+[[autodoc]] ImageToImagePipeline
+    - __call__
+    - all
+
+### ObjectDetectionPipeline
+
+[[autodoc]] ObjectDetectionPipeline
+    - __call__
+    - all
+
+### VideoClassificationPipeline
+
+[[autodoc]] VideoClassificationPipeline
+    - __call__
+    - all
+
+### ZeroShotImageClassificationPipeline
+
+[[autodoc]] ZeroShotImageClassificationPipeline
+    - __call__
+    - all
+
+### ZeroShotObjectDetectionPipeline
+
+[[autodoc]] ZeroShotObjectDetectionPipeline
+    - __call__
+    - all
+
+## 自然语言处理
+
+可用于自然语言处理任务的pipeline包括以下几种。
+
+### ConversationalPipeline
+
+[[autodoc]] Conversation
+
+[[autodoc]] ConversationalPipeline
+    - __call__
+    - all
+
+### FillMaskPipeline
+
+[[autodoc]] FillMaskPipeline
+    - __call__
+    - all
+
+### NerPipeline
+
+[[autodoc]] NerPipeline
+
+See [`TokenClassificationPipeline`] for all details.
+
+### QuestionAnsweringPipeline
+
+[[autodoc]] QuestionAnsweringPipeline
+    - __call__
+    - all
+
+### SummarizationPipeline
+
+[[autodoc]] SummarizationPipeline
+    - __call__
+    - all
+
+### TableQuestionAnsweringPipeline
+
+[[autodoc]] TableQuestionAnsweringPipeline
+    - __call__
+
+### TextClassificationPipeline
+
+[[autodoc]] TextClassificationPipeline
+    - __call__
+    - all
+
+### TextGenerationPipeline
+
+[[autodoc]] TextGenerationPipeline
+    - __call__
+    - all
+
+### Text2TextGenerationPipeline
+
+[[autodoc]] Text2TextGenerationPipeline
+    - __call__
+    - all
+
+### TokenClassificationPipeline
+
+[[autodoc]] TokenClassificationPipeline
+    - __call__
+    - all
+
+### TranslationPipeline
+
+[[autodoc]] TranslationPipeline
+    - __call__
+    - all
+
+### ZeroShotClassificationPipeline
+
+[[autodoc]] ZeroShotClassificationPipeline
+    - __call__
+    - all
+
+## 多模态 
+
+可用于多模态任务的pipeline包括以下几种。
+
+### DocumentQuestionAnsweringPipeline
+
+[[autodoc]] DocumentQuestionAnsweringPipeline
+    - __call__
+    - all
+
+### FeatureExtractionPipeline
+
+[[autodoc]] FeatureExtractionPipeline
+    - __call__
+    - all
+
+### ImageToTextPipeline
+
+[[autodoc]] ImageToTextPipeline
+    - __call__
+    - all
+
+### MaskGenerationPipeline
+
+[[autodoc]] MaskGenerationPipeline
+    - __call__
+    - all
+
+### VisualQuestionAnsweringPipeline
+
+[[autodoc]] VisualQuestionAnsweringPipeline
+    - __call__
+    - all
+
+## Parent class: `Pipeline`
+
+[[autodoc]] Pipeline
diff --git a/docs/source/zh/main_classes/processors.md b/docs/source/zh/main_classes/processors.md
new file mode 100644
index 00000000000000..60167e317adf90
--- /dev/null
+++ b/docs/source/zh/main_classes/processors.md
@@ -0,0 +1,146 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Processors
+
+在 Transformers 库中，processors可以有两种不同的含义：
+- 为多模态模型，例如[Wav2Vec2](../model_doc/wav2vec2)（语音和文本）或[CLIP](../model_doc/clip)（文本和视觉）预处理输入的对象
+- 在库的旧版本中用于预处理GLUE或SQUAD数据的已弃用对象。
+
+## 多模态processors
+
+任何多模态模型都需要一个对象来编码或解码将多个模态（包括文本、视觉和音频）组合在一起的数据。这由称为processors的对象处理，这些processors将两个或多个处理对象组合在一起，例如tokenizers（用于文本模态），image processors（用于视觉）和feature extractors（用于音频）。
+
+这些processors继承自以下实现保存和加载功能的基类：
+
+
+[[autodoc]] ProcessorMixin
+
+## 已弃用的processors
+
+所有processor都遵循与 [`~data.processors.utils.DataProcessor`] 相同的架构。processor返回一个 [`~data.processors.utils.InputExample`] 列表。这些 [`~data.processors.utils.InputExample`] 可以转换为 [`~data.processors.utils.InputFeatures`] 以供输送到模型。
+
+[[autodoc]] data.processors.utils.DataProcessor
+
+[[autodoc]] data.processors.utils.InputExample
+
+[[autodoc]] data.processors.utils.InputFeatures
+
+## GLUE
+
+[General Language Understanding Evaluation (GLUE)](https://gluebenchmark.com/) 是一个基准测试，评估模型在各种现有的自然语言理解任务上的性能。它与论文 [GLUE: A multi-task benchmark and analysis platform for natural language understanding](https://openreview.net/pdf?id=rJ4km2R5t7) 一同发布。
+
+该库为以下任务提供了总共10个processor：MRPC、MNLI、MNLI（mismatched）、CoLA、SST2、STSB、QQP、QNLI、RTE 和 WNLI。
+
+这些processor是：
+
+- [`~data.processors.utils.MrpcProcessor`]
+- [`~data.processors.utils.MnliProcessor`]
+- [`~data.processors.utils.MnliMismatchedProcessor`]
+- [`~data.processors.utils.Sst2Processor`]
+- [`~data.processors.utils.StsbProcessor`]
+- [`~data.processors.utils.QqpProcessor`]
+- [`~data.processors.utils.QnliProcessor`]
+- [`~data.processors.utils.RteProcessor`]
+- [`~data.processors.utils.WnliProcessor`]
+
+此外，还可以使用以下方法从数据文件加载值并将其转换为 [`~data.processors.utils.InputExample`] 列表。
+
+[[autodoc]] data.processors.glue.glue_convert_examples_to_features
+
+
+## XNLI
+
+[跨语言NLI语料库（XNLI）](https://www.nyu.edu/projects/bowman/xnli/) 是一个评估跨语言文本表示质量的基准测试。XNLI是一个基于[*MultiNLI*](http://www.nyu.edu/projects/bowman/multinli/)的众包数据集：”文本对“被标记为包含15种不同语言（包括英语等高资源语言和斯瓦希里语等低资源语言）的文本蕴涵注释。
+
+它与论文 [XNLI: Evaluating Cross-lingual Sentence Representations](https://arxiv.org/abs/1809.05053) 一同发布。
+
+该库提供了加载XNLI数据的processor：
+
+- [`~data.processors.utils.XnliProcessor`]
+
+请注意，由于测试集上有“gold”标签，因此评估是在测试集上进行的。
+
+使用这些processor的示例在 [run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification/run_xnli.py) 脚本中提供。
+
+
+## SQuAD
+
+[斯坦福问答数据集（SQuAD）](https://rajpurkar.github.io/SQuAD-explorer//) 是一个评估模型在问答上性能的基准测试。有两个版本，v1.1 和 v2.0。第一个版本（v1.1）与论文 [SQuAD: 100,000+ Questions for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250) 一同发布。第二个版本（v2.0）与论文 [Know What You Don't Know: Unanswerable Questions for SQuAD](https://arxiv.org/abs/1806.03822) 一同发布。
+
+该库为两个版本各自提供了一个processor：
+
+### Processors
+
+这两个processor是：
+
+- [`~data.processors.utils.SquadV1Processor`]
+- [`~data.processors.utils.SquadV2Processor`]
+
+它们都继承自抽象类 [`~data.processors.utils.SquadProcessor`]。
+
+[[autodoc]] data.processors.squad.SquadProcessor
+    - all
+
+此外，可以使用以下方法将 SQuAD 示例转换为可用作模型输入的 [`~data.processors.utils.SquadFeatures`]。
+
+[[autodoc]] data.processors.squad.squad_convert_examples_to_features
+
+
+这些processor以及前面提到的方法可以与包含数据的文件以及tensorflow_datasets包一起使用。下面给出了示例。
+
+
+### Example使用
+
+以下是使用processor以及使用数据文件的转换方法的示例：
+
+```python
+# Loading a V2 processor
+processor = SquadV2Processor()
+examples = processor.get_dev_examples(squad_v2_data_dir)
+
+# Loading a V1 processor
+processor = SquadV1Processor()
+examples = processor.get_dev_examples(squad_v1_data_dir)
+
+features = squad_convert_examples_to_features(
+    examples=examples,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+    doc_stride=args.doc_stride,
+    max_query_length=max_query_length,
+    is_training=not evaluate,
+)
+```
+
+使用 *tensorflow_datasets* 就像使用数据文件一样简单：
+
+```python
+# tensorflow_datasets only handle Squad V1.
+tfds_examples = tfds.load("squad")
+examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+
+features = squad_convert_examples_to_features(
+    examples=examples,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+    doc_stride=args.doc_stride,
+    max_query_length=max_query_length,
+    is_training=not evaluate,
+)
+```
+
+另一个使用这些processor的示例在 [run_squad.py](https://github.com/huggingface/transformers/tree/main/examples/legacy/question-answering/run_squad.py) 脚本中提供。
\ No newline at end of file
diff --git a/docs/source/zh/main_classes/quantization.md b/docs/source/zh/main_classes/quantization.md
new file mode 100644
index 00000000000000..3c7e4d9212a1d0
--- /dev/null
+++ b/docs/source/zh/main_classes/quantization.md
@@ -0,0 +1,572 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 量化 🤗 Transformers 模型
+
+## AWQ集成
+
+AWQ方法已经在[*AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration*论文](https://arxiv.org/abs/2306.00978)中引入。通过AWQ，您可以以4位精度运行模型，同时保留其原始性能（即没有性能降级），并具有比下面介绍的其他量化方法更出色的吞吐量 - 达到与纯`float16`推理相似的吞吐量。
+
+我们现在支持使用任何AWQ模型进行推理，这意味着任何人都可以加载和使用在Hub上推送或本地保存的AWQ权重。请注意，使用AWQ需要访问NVIDIA GPU。目前不支持CPU推理。
+
+
+### 量化一个模型
+
+我们建议用户查看生态系统中不同的现有工具，以使用AWQ算法对其模型进行量化，例如：
+
+- [`llm-awq`](https://github.com/mit-han-lab/llm-awq)，来自MIT Han Lab
+- [`autoawq`](https://github.com/casper-hansen/AutoAWQ)，来自[`casper-hansen`](https://github.com/casper-hansen)
+- Intel neural compressor，来自Intel - 通过[`optimum-intel`](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc)使用
+
+生态系统中可能存在许多其他工具，请随时提出PR将它们添加到列表中。
+目前与🤗 Transformers的集成仅适用于使用`autoawq`和`llm-awq`量化后的模型。大多数使用`auto-awq`量化的模型可以在🤗 Hub的[`TheBloke`](https://huggingface.co/TheBloke)命名空间下找到，要使用`llm-awq`对模型进行量化，请参阅[`llm-awq`](https://github.com/mit-han-lab/llm-awq/)的示例文件夹中的[`convert_to_hf.py`](https://github.com/mit-han-lab/llm-awq/blob/main/examples/convert_to_hf.py)脚本。
+
+
+### 加载一个量化的模型
+
+您可以使用`from_pretrained`方法从Hub加载一个量化模型。通过检查模型配置文件（`configuration.json`）中是否存在`quantization_config`属性，来进行确认推送的权重是量化的。您可以通过检查字段`quantization_config.quant_method`来确认模型是否以AWQ格式进行量化，该字段应该设置为`"awq"`。请注意，为了性能原因，默认情况下加载模型将设置其他权重为`float16`。如果您想更改这种设置，可以通过将`torch_dtype`参数设置为`torch.float32`或`torch.bfloat16`。在下面的部分中，您可以找到一些示例片段和notebook。
+
+
+## 示例使用
+
+首先，您需要安装[`autoawq`](https://github.com/casper-hansen/AutoAWQ)库
+
+```bash
+pip install autoawq
+```
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")
+```
+
+如果您首先将模型加载到CPU上，请确保在使用之前将其移动到GPU设备上。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda:0")
+```
+
+### 结合 AWQ 和 Flash Attention
+
+您可以将AWQ量化与Flash Attention结合起来，得到一个既被量化又更快速的模型。只需使用`from_pretrained`加载模型，并传递`attn_implementation="flash_attention_2"`参数。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", attn_implementation="flash_attention_2", device_map="cuda:0")
+```
+
+### 基准测试
+
+我们使用[`optimum-benchmark`](https://github.com/huggingface/optimum-benchmark)库进行了一些速度、吞吐量和延迟基准测试。
+
+请注意，在编写本文档部分时，可用的量化方法包括：`awq`、`gptq`和`bitsandbytes`。
+
+基准测试在一台NVIDIA-A100实例上运行，使用[`TheBloke/Mistral-7B-v0.1-AWQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-AWQ)作为AWQ模型，[`TheBloke/Mistral-7B-v0.1-GPTQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-GPTQ)作为GPTQ模型。我们还将其与`bitsandbytes`量化模型和`float16`模型进行了对比。以下是一些结果示例：
+
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_memory_plot.png">
+</div>
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_memory_plot.png">
+</div>
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_throughput_plot.png">
+</div>
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_latency_plot.png">
+</div>
+
+你可以在[此链接](https://github.com/huggingface/optimum-benchmark/tree/main/examples/running-mistrals)中找到完整的结果以及包版本。
+
+从结果来看，AWQ量化方法是推理、文本生成中最快的量化方法，并且在文本生成的峰值内存方面属于最低。然而，对于每批数据，AWQ似乎有最大的前向延迟。
+
+
+### Google colab 演示
+
+查看如何在[Google Colab演示](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)中使用此集成！
+
+
+### AwqConfig
+
+[[autodoc]] AwqConfig
+
+## `AutoGPTQ` 集成
+
+🤗 Transformers已经整合了`optimum` API，用于对语言模型执行GPTQ量化。您可以以8、4、3甚至2位加载和量化您的模型，而性能无明显下降，并且推理速度更快！这受到大多数GPU硬件的支持。
+
+要了解更多关于量化模型的信息，请查看：
+- [GPTQ](https://arxiv.org/pdf/2210.17323.pdf)论文
+- `optimum`关于GPTQ量化的[指南](https://huggingface.co/docs/optimum/llm_quantization/usage_guides/quantization)
+- 用作后端的[`AutoGPTQ`](https://github.com/PanQiWei/AutoGPTQ)库
+
+
+### 要求
+
+为了运行下面的代码，您需要安装：
+
+- 安装最新版本的 `AutoGPTQ` 库
+`pip install auto-gptq`
+
+- 从源代码安装最新版本的`optimum`
+`pip install git+https://github.com/huggingface/optimum.git`
+
+- 从源代码安装最新版本的`transformers`
+`pip install git+https://github.com/huggingface/transformers.git`
+
+- 安装最新版本的`accelerate`库： 
+`pip install --upgrade accelerate`
+
+请注意，目前GPTQ集成仅支持文本模型，对于视觉、语音或多模态模型可能会遇到预期以外结果。
+
+### 加载和量化模型
+
+GPTQ是一种在使用量化模型之前需要进行权重校准的量化方法。如果您想从头开始对transformers模型进行量化，生成量化模型可能需要一些时间（在Google Colab上对`facebook/opt-350m`模型量化约为5分钟）。
+
+因此，有两种不同的情况下您可能想使用GPTQ量化模型。第一种情况是加载已经由其他用户在Hub上量化的模型，第二种情况是从头开始对您的模型进行量化并保存或推送到Hub，以便其他用户也可以使用它。
+
+
+#### GPTQ 配置
+
+为了加载和量化一个模型，您需要创建一个[`GPTQConfig`]。您需要传递`bits`的数量，一个用于校准量化的`dataset`，以及模型的`tokenizer`以准备数据集。
+
+```python 
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)
+```
+
+请注意，您可以将自己的数据集以字符串列表形式传递到模型。然而，强烈建议您使用GPTQ论文中提供的数据集。
+
+
+```python
+dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer)
+```
+
+#### 量化
+
+您可以通过使用`from_pretrained`并设置`quantization_config`来对模型进行量化。
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config)
+
+```
+
+请注意，您需要一个GPU来量化模型。我们将模型放在cpu中，并将模块来回移动到gpu中，以便对其进行量化。
+
+如果您想在使用 CPU 卸载的同时最大化 GPU 使用率，您可以设置 `device_map = "auto"`。
+
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
+```
+
+请注意，不支持磁盘卸载。此外，如果由于数据集而内存不足，您可能需要在`from_pretrained`中设置`max_memory`。查看这个[指南](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#designing-a-device-map)以了解有关`device_map`和`max_memory`的更多信息。
+
+
+<Tip warning={true}>
+目前，GPTQ量化仅适用于文本模型。此外，量化过程可能会花费很多时间，具体取决于硬件性能（175B模型在NVIDIA A100上需要4小时）。请在Hub上检查是否有模型的GPTQ量化版本。如果没有，您可以在GitHub上提交需求。 
+</Tip>
+
+### 推送量化模型到 🤗 Hub
+
+您可以使用`push_to_hub`将量化模型像任何模型一样推送到Hub。量化配置将与模型一起保存和推送。
+
+```python
+quantized_model.push_to_hub("opt-125m-gptq")
+tokenizer.push_to_hub("opt-125m-gptq")
+```
+
+如果您想在本地计算机上保存量化模型，您也可以使用`save_pretrained`来完成：
+
+```python
+quantized_model.save_pretrained("opt-125m-gptq")
+tokenizer.save_pretrained("opt-125m-gptq")
+```
+
+请注意，如果您量化模型时想使用`device_map`，请确保在保存之前将整个模型移动到您的GPU或CPU之一。
+
+```python
+quantized_model.to("cpu")
+quantized_model.save_pretrained("opt-125m-gptq")
+```
+
+### 从 🤗 Hub 加载一个量化模型
+
+您可以使用`from_pretrained`从Hub加载量化模型。
+请确保推送权重是量化的，检查模型配置对象中是否存在`quantization_config`属性。
+
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq")
+```
+
+如果您想更快地加载模型，并且不需要分配比实际需要内存更多的内存，量化模型也使用`device_map`参数。确保您已安装`accelerate`库。
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
+```
+
+### Exllama内核加快推理速度
+
+保留格式：对于 4 位模型，您可以使用 exllama 内核来提高推理速度。默认情况下，它处于启用状态。您可以通过在 [`GPTQConfig`] 中传递 `use_exllama` 来更改此配置。这将覆盖存储在配置中的量化配置。请注意，您只能覆盖与内核相关的属性。此外，如果您想使用 exllama 内核，整个模型需要全部部署在 gpus 上。此外，您可以使用 版本 > 0.4.2 的 Auto-GPTQ 并传递 `device_map` = "cpu" 来执行 CPU 推理。对于 CPU 推理，您必须在 `GPTQConfig` 中传递 `use_exllama = False`。
+
+```py
+import torch
+gptq_config = GPTQConfig(bits=4)
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
+```
+
+随着 exllamav2 内核的发布，与 exllama 内核相比，您可以获得更快的推理速度。您只需在 [`GPTQConfig`] 中传递 `exllama_config={"version": 2}`：
+
+```py
+import torch
+gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config = gptq_config)
+```
+
+请注意，目前仅支持 4 位模型。此外，如果您正在使用 peft 对量化模型进行微调，建议禁用 exllama 内核。 
+
+您可以在此找到这些内核的基准测试 [这里](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)
+
+
+#### 微调一个量化模型
+
+在Hugging Face生态系统的官方支持下，您可以使用GPTQ进行量化后的模型进行微调。 
+请查看`peft`库了解更多详情。
+
+### 示例演示
+
+请查看 Google Colab [notebook](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94ilkUFu6ZX4ceb?usp=sharing)，了解如何使用GPTQ量化您的模型以及如何使用peft微调量化模型。
+
+### GPTQConfig
+
+[[autodoc]] GPTQConfig
+
+
+## `bitsandbytes` 集成
+
+🤗 Transformers 与 `bitsandbytes` 上最常用的模块紧密集成。您可以使用几行代码以 8 位精度加载您的模型。
+自bitsandbytes的0.37.0版本发布以来，大多数GPU硬件都支持这一点。
+
+在[LLM.int8()](https://arxiv.org/abs/2208.07339)论文中了解更多关于量化方法的信息，或者在[博客文章](https://huggingface.co/blog/hf-bitsandbytes-integration)中了解关于合作的更多信息。
+
+自其“0.39.0”版本发布以来，您可以使用FP4数据类型，通过4位量化加载任何支持“device_map”的模型。
+
+如果您想量化自己的 pytorch 模型，请查看 🤗 Accelerate 的[文档](https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization)。
+
+以下是您可以使用“bitsandbytes”集成完成的事情
+
+### 通用用法
+
+只要您的模型支持使用 🤗 Accelerate 进行加载并包含 `torch.nn.Linear` 层，您可以在调用 [`~PreTrainedModel.from_pretrained`] 方法时使用 `load_in_8bit` 或 `load_in_4bit` 参数来量化模型。这也应该适用于任何模态。
+
+```python
+from transformers import AutoModelForCausalLM
+
+model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True)
+model_4bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_4bit=True)
+```
+
+默认情况下，所有其他模块（例如 `torch.nn.LayerNorm`）将被转换为 `torch.float16` 类型。但如果您想更改它们的 `dtype`，可以重载 `torch_dtype` 参数：
+
+```python
+>>> import torch
+>>> from transformers import AutoModelForCausalLM
+
+>>> model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True, torch_dtype=torch.float32)
+>>> model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
+torch.float32
+```
+
+
+### FP4 量化 
+
+#### 要求
+
+确保在运行以下代码段之前已完成以下要求：
+
+- 最新版本 `bitsandbytes` 库
+`pip install bitsandbytes>=0.39.0`
+
+- 安装最新版本 `accelerate`
+`pip install --upgrade accelerate`
+
+- 安装最新版本 `transformers`
+`pip install --upgrade transformers`
+
+#### 提示和最佳实践
+
+
+- **高级用法：** 请参考 [此 Google Colab notebook](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf) 以获取 4 位量化高级用法和所有可选选项。
+
+- **使用 `batch_size=1` 实现更快的推理：** 自 `bitsandbytes` 的 `0.40.0` 版本以来，设置 `batch_size=1`，您可以从快速推理中受益。请查看 [这些发布说明](https://github.com/TimDettmers/bitsandbytes/releases/tag/0.40.0) ，并确保使用大于 `0.40.0` 的版本以直接利用此功能。
+
+- **训练：** 根据 [QLoRA 论文](https://arxiv.org/abs/2305.14314)，对于4位基模型训练（使用 LoRA 适配器），应使用 `bnb_4bit_quant_type='nf4'`。
+
+- **推理：** 对于推理，`bnb_4bit_quant_type` 对性能影响不大。但是为了与模型的权重保持一致，请确保使用相同的 `bnb_4bit_compute_dtype` 和 `torch_dtype` 参数。
+
+#### 加载 4 位量化的大模型
+
+在调用 `.from_pretrained` 方法时使用 `load_in_4bit=True`，可以将您的内存使用量减少到大约原来的 1/4。
+
+```python
+# pip install transformers accelerate bitsandbytes
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "bigscience/bloom-1b7"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
+```
+
+<Tip warning={true}>
+
+需要注意的是，一旦模型以 4 位量化方式加载，就无法将量化后的权重推送到 Hub 上。此外，您不能训练 4 位量化权重，因为目前尚不支持此功能。但是，您可以使用 4 位量化模型来训练额外参数，这将在下一部分中介绍。
+
+</Tip>
+
+### 加载 8 位量化的大模型
+
+您可以通过在调用 `.from_pretrained` 方法时使用 `load_in_8bit=True` 参数，将内存需求大致减半来加载模型
+
+
+```python
+# pip install transformers accelerate bitsandbytes
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "bigscience/bloom-1b7"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+```
+
+然后，像通常使用 `PreTrainedModel` 一样使用您的模型。
+
+您可以使用 `get_memory_footprint` 方法检查模型的内存占用。
+
+
+```python
+print(model.get_memory_footprint())
+```
+
+通过这种集成，我们能够在较小的设备上加载大模型并运行它们而没有任何问题。
+
+<Tip warning={true}>
+
+需要注意的是，一旦模型以 8 位量化方式加载，除了使用最新的 `transformers` 和 `bitsandbytes` 之外，目前尚无法将量化后的权重推送到 Hub 上。此外，您不能训练 8 位量化权重，因为目前尚不支持此功能。但是，您可以使用 8 位量化模型来训练额外参数，这将在下一部分中介绍。
+
+注意，`device_map` 是可选的，但设置 `device_map = 'auto'` 更适合用于推理，因为它将更有效地调度可用资源上的模型。
+
+
+</Tip>
+
+#### 高级用例
+
+在这里，我们将介绍使用 FP4 量化的一些高级用例。
+
+##### 更改计算数据类型
+
+计算数据类型用于改变在进行计算时使用的数据类型。例如，hidden states可以是 `float32`，但为了加速，计算时可以被设置为 `bf16`。默认情况下，计算数据类型被设置为 `float32`。
+
+
+```python
+import torch
+from transformers import BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+```
+
+#### 使用 NF4（普通浮点数 4）数据类型
+
+您还可以使用 NF4 数据类型，这是一种针对使用正态分布初始化的权重而适应的新型 4 位数据类型。要运行：
+
+```python
+from transformers import BitsAndBytesConfig
+
+nf4_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+)
+
+model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
+```
+
+#### 使用嵌套量化进行更高效的内存推理
+
+我们还建议用户使用嵌套量化技术。从我们的经验观察来看，这种方法在不增加额外性能的情况下节省更多内存。这使得 llama-13b 模型能够在具有 1024 个序列长度、1 个批次大小和 4 个梯度累积步骤的 NVIDIA-T4 16GB 上进行 fine-tuning。
+
+```python
+from transformers import BitsAndBytesConfig
+
+double_quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+)
+
+model_double_quant = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=double_quant_config)
+```
+
+### 将量化模型推送到🤗 Hub
+
+您可以使用 `push_to_hub` 方法将量化模型推送到 Hub 上。这将首先推送量化配置文件，然后推送量化模型权重。
+请确保使用 `bitsandbytes>0.37.2`（在撰写本文时，我们使用的是 `bitsandbytes==0.38.0.post1`）才能使用此功能。
+
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+
+model.push_to_hub("bloom-560m-8bit")
+```
+
+<Tip warning={true}>
+
+对大模型，强烈鼓励将 8 位量化模型推送到 Hub 上，以便让社区能够从内存占用减少和加载中受益，例如在 Google Colab 上加载大模型。
+
+</Tip>
+
+### 从🤗 Hub加载量化模型
+
+您可以使用 `from_pretrained` 方法从 Hub 加载量化模型。请确保推送的权重是量化的，检查模型配置对象中是否存在 `quantization_config` 属性。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto")
+```
+
+请注意，在这种情况下，您不需要指定 `load_in_8bit=True` 参数，但需要确保 `bitsandbytes` 和 `accelerate` 已安装。
+情注意，`device_map` 是可选的，但设置 `device_map = 'auto'` 更适合用于推理，因为它将更有效地调度可用资源上的模型。
+
+### 高级用例
+
+本节面向希望探索除了加载和运行 8 位模型之外还能做什么的进阶用户。
+
+#### 在 `cpu` 和 `gpu` 之间卸载
+
+此高级用例之一是能够加载模型并将权重分派到 `CPU` 和 `GPU` 之间。请注意，将在 CPU 上分派的权重 **不会** 转换为 8 位，因此会保留为 `float32`。此功能适用于想要适应非常大的模型并将模型分派到 GPU 和 CPU 之间的用户。
+
+首先，从 `transformers` 中加载一个 [`BitsAndBytesConfig`]，并将属性 `llm_int8_enable_fp32_cpu_offload` 设置为 `True`：
+
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
+```
+
+假设您想加载 `bigscience/bloom-1b7` 模型，您的 GPU显存仅足够容纳除了`lm_head`外的整个模型。因此，您可以按照以下方式编写自定义的 device_map：
+
+```python
+device_map = {
+    "transformer.word_embeddings": 0,
+    "transformer.word_embeddings_layernorm": 0,
+    "lm_head": "cpu",
+    "transformer.h": 0,
+    "transformer.ln_f": 0,
+}
+```
+
+然后如下加载模型：
+
+```python
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    "bigscience/bloom-1b7",
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+```
+
+这就是全部内容！享受您的模型吧！
+
+#### 使用`llm_int8_threshold`
+
+您可以使用 `llm_int8_threshold` 参数来更改异常值的阈值。“异常值”是一个大于特定阈值的`hidden state`值。
+这对应于`LLM.int8()`论文中描述的异常检测的异常阈值。任何高于此阈值的`hidden state`值都将被视为异常值，对这些值的操作将在 fp16 中完成。值通常是正态分布的，也就是说，大多数值在 [-3.5, 3.5] 范围内，但有一些额外的系统异常值，对于大模型来说，它们的分布非常不同。这些异常值通常在区间 [-60, -6] 或 [6, 60] 内。Int8 量化对于幅度为 ~5 的值效果很好，但超出这个范围，性能就会明显下降。一个好的默认阈值是 6，但对于更不稳定的模型（小模型、微调）可能需要更低的阈值。
+这个参数会影响模型的推理速度。我们建议尝试这个参数，以找到最适合您的用例的参数。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "bigscience/bloom-1b7"
+
+quantization_config = BitsAndBytesConfig(
+    llm_int8_threshold=10,
+)
+
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+```
+
+#### 跳过某些模块的转换
+
+一些模型有几个需要保持未转换状态以确保稳定性的模块。例如，Jukebox 模型有几个 `lm_head` 模块需要跳过。使用 `llm_int8_skip_modules` 参数进行相应操作。
+
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "bigscience/bloom-1b7"
+
+quantization_config = BitsAndBytesConfig(
+    llm_int8_skip_modules=["lm_head"],
+)
+
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+```
+
+#### 微调已加载为8位精度的模型
+
+借助Hugging Face生态系统中适配器（adapters）的官方支持，您可以在8位精度下微调模型。这使得可以在单个Google Colab中微调大模型，例如`flan-t5-large`或`facebook/opt-6.7b`。请查看[`peft`](https://github.com/huggingface/peft)库了解更多详情。
+
+注意，加载模型进行训练时无需传递`device_map`。它将自动将您的模型加载到GPU上。如果需要，您可以将设备映射为特定设备（例如`cuda:0`、`0`、`torch.device('cuda:0')`）。请注意，`device_map=auto`仅应用于推理。
+
+
+### BitsAndBytesConfig
+
+[[autodoc]] BitsAndBytesConfig
+
+
+## 使用 🤗 `optimum` 进行量化
+
+请查看[Optimum 文档](https://huggingface.co/docs/optimum/index)以了解更多关于`optimum`支持的量化方法，并查看这些方法是否适用于您的用例。
+
diff --git a/docs/source/zh/main_classes/text_generation.md b/docs/source/zh/main_classes/text_generation.md
new file mode 100644
index 00000000000000..773228832f2272
--- /dev/null
+++ b/docs/source/zh/main_classes/text_generation.md
@@ -0,0 +1,58 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Generation
+
+每个框架都在它们各自的 `GenerationMixin` 类中实现了文本生成的 `generate` 方法：
+
+- PyTorch [`~generation.GenerationMixin.generate`] 在 [`~generation.GenerationMixin`] 中实现。
+- TensorFlow [`~generation.TFGenerationMixin.generate`] 在 [`~generation.TFGenerationMixin`] 中实现。
+- Flax/JAX [`~generation.FlaxGenerationMixin.generate`] 在 [`~generation.FlaxGenerationMixin`] 中实现。
+
+无论您选择哪个框架，都可以使用 [`~generation.GenerationConfig`] 类实例对 generate 方法进行参数化。有关生成方法的控制参数的完整列表，请参阅此类。
+
+要了解如何检查模型的生成配置、默认值是什么、如何临时更改参数以及如何创建和保存自定义生成配置，请参阅 [文本生成策略指南](../generation_strategies)。该指南还解释了如何使用相关功能，如token流。
+
+## GenerationConfig
+
+[[autodoc]] generation.GenerationConfig
+	- from_pretrained
+	- from_model_config
+	- save_pretrained
+
+## GenerationMixin
+
+[[autodoc]] generation.GenerationMixin
+	- generate
+	- compute_transition_scores
+	- greedy_search
+	- sample
+	- beam_search
+	- beam_sample
+	- contrastive_search
+	- group_beam_search
+	- constrained_beam_search
+
+## TFGenerationMixin
+
+[[autodoc]] generation.TFGenerationMixin
+	- generate
+	- compute_transition_scores
+
+## FlaxGenerationMixin
+
+[[autodoc]] generation.FlaxGenerationMixin
+	- generate
diff --git a/docs/source/zh/main_classes/tokenizer.md b/docs/source/zh/main_classes/tokenizer.md
new file mode 100644
index 00000000000000..f89fc20b53d1d9
--- /dev/null
+++ b/docs/source/zh/main_classes/tokenizer.md
@@ -0,0 +1,65 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Tokenizer
+
+tokenizer负责准备输入以供模型使用。该库包含所有模型的tokenizer。大多数tokenizer都有两种版本：一个是完全的 Python 实现，另一个是基于 Rust 库 [🤗 Tokenizers](https://github.com/huggingface/tokenizers) 的“Fast”实现。"Fast" 实现允许：
+
+1. 在批量分词时显著提速
+2. 在原始字符串（字符和单词）和token空间之间进行映射的其他方法（例如，获取包含给定字符的token的索引或与给定token对应的字符范围）。
+
+基类 [PreTrainedTokenizer] 和 [PreTrained TokenizerFast] 实现了在模型输入中编码字符串输入的常用方法（见下文），并从本地文件或目录或从库提供的预训练的 tokenizer（从 HuggingFace 的 AWS S3 存储库下载）实例化/保存 python 和“Fast” tokenizer。它们都依赖于包含常用方法的 [`~tokenization_utils_base.PreTrainedTokenizerBase`]和[`~tokenization_utils_base.SpecialTokensMixin`]。
+
+因此，[`PreTrainedTokenizer`] 和 [`PreTrainedTokenizerFast`] 实现了使用所有tokenizers的主要方法：
+
+- 分词（将字符串拆分为子词标记字符串），将tokens字符串转换为id并转换回来，以及编码/解码（即标记化并转换为整数）。
+- 以独立于底层结构（BPE、SentencePiece……）的方式向词汇表中添加新tokens。
+- 管理特殊tokens（如mask、句首等）：添加它们，将它们分配给tokenizer中的属性以便于访问，并确保它们在标记过程中不会被分割。
+
+[`BatchEncoding`] 包含 [`~tokenization_utils_base.PreTrainedTokenizerBase`] 的编码方法（`__call__`、`encode_plus` 和 `batch_encode_plus`）的输出，并且是从 Python 字典派生的。当tokenizer是纯 Python tokenizer时，此类的行为就像标准的 Python 字典一样，并保存这些方法计算的各种模型输入（`input_ids`、`attention_mask` 等）。当分词器是“Fast”分词器时（即由 HuggingFace 的 [tokenizers 库](https://github.com/huggingface/tokenizers) 支持），此类还提供了几种高级对齐方法，可用于在原始字符串（字符和单词）与token空间之间进行映射（例如，获取包含给定字符的token的索引或与给定token对应的字符范围）。
+
+
+## PreTrainedTokenizer
+
+[[autodoc]] PreTrainedTokenizer
+    - __call__
+    - add_tokens
+    - add_special_tokens
+    - apply_chat_template
+    - batch_decode
+    - decode
+    - encode
+    - push_to_hub
+    - all
+
+## PreTrainedTokenizerFast
+
+[`PreTrainedTokenizerFast`] 依赖于 [tokenizers](https://huggingface.co/docs/tokenizers) 库。可以非常简单地将从 🤗 tokenizers 库获取的tokenizers加载到 🤗 transformers 中。查看 [使用 🤗 tokenizers 的分词器](../fast_tokenizers) 页面以了解如何执行此操作。
+
+[[autodoc]] PreTrainedTokenizerFast
+    - __call__
+    - add_tokens
+    - add_special_tokens
+    - apply_chat_template
+    - batch_decode
+    - decode
+    - encode
+    - push_to_hub
+    - all
+
+## BatchEncoding
+
+[[autodoc]] BatchEncoding
diff --git a/docs/source/zh/main_classes/trainer.md b/docs/source/zh/main_classes/trainer.md
new file mode 100644
index 00000000000000..049a3724114bd2
--- /dev/null
+++ b/docs/source/zh/main_classes/trainer.md
@@ -0,0 +1,665 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Trainer
+
+[`Trainer`] 类提供了一个 PyTorch 的 API，用于处理大多数标准用例的全功能训练。它在大多数[示例脚本](https://github.com/huggingface/transformers/tree/main/examples)中被使用。
+
+<Tip>
+
+如果你想要使用自回归技术在文本数据集上微调像 Llama-2 或 Mistral 这样的语言模型，考虑使用 [`trl`](https://github.com/huggingface/trl) 的 [`~trl.SFTTrainer`]。[`~trl.SFTTrainer`] 封装了 [`Trainer`]，专门针对这个特定任务进行了优化，并支持序列打包、LoRA、量化和 DeepSpeed，以有效扩展到任何模型大小。另一方面，[`Trainer`] 是一个更通用的选项，适用于更广泛的任务。
+
+</Tip>
+
+在实例化你的 [`Trainer`] 之前，创建一个 [`TrainingArguments`]，以便在训练期间访问所有定制点。
+
+这个 API 支持在多个 GPU/TPU 上进行分布式训练，支持 [NVIDIA Apex](https://github.com/NVIDIA/apex) 的混合精度和 PyTorch 的原生 AMP。
+
+[`Trainer`] 包含基本的训练循环，支持上述功能。如果需要自定义训练，你可以继承 `Trainer` 并覆盖以下方法：
+
+- **get_train_dataloader** -- 创建训练 DataLoader。
+- **get_eval_dataloader** -- 创建评估 DataLoader。
+- **get_test_dataloader** -- 创建测试 DataLoader。
+- **log** -- 记录观察训练的各种对象的信息。
+- **create_optimizer_and_scheduler** -- 如果它们没有在初始化时传递，请设置优化器和学习率调度器。请注意，你还可以单独继承或覆盖 `create_optimizer` 和 `create_scheduler` 方法。
+- **create_optimizer** -- 如果在初始化时没有传递，则设置优化器。
+- **create_scheduler** -- 如果在初始化时没有传递，则设置学习率调度器。
+- **compute_loss** - 计算单批训练输入的损失。
+- **training_step** -- 执行一步训练。
+- **prediction_step** -- 执行一步评估/测试。
+- **evaluate** -- 运行评估循环并返回指标。
+- **predict** -- 返回在测试集上的预测（如果有标签，则包括指标）。
+
+<Tip warning={true}>
+
+[`Trainer`] 类被优化用于 🤗 Transformers 模型，并在你在其他模型上使用时可能会有一些令人惊讶的结果。当在你自己的模型上使用时，请确保：
+
+- 你的模型始终返回元组或 [`~utils.ModelOutput`] 的子类。
+- 如果提供了 `labels` 参数，你的模型可以计算损失，并且损失作为元组的第一个元素返回（如果你的模型返回元组）。
+- 你的模型可以接受多个标签参数（在 [`TrainingArguments`] 中使用 `label_names` 将它们的名称指示给 [`Trainer`]），但它们中没有一个应该被命名为 `"label"`。
+
+</Tip>
+
+以下是如何自定义 [`Trainer`] 以使用加权损失的示例（在训练集不平衡时很有用）：
+
+```python
+from torch import nn
+from transformers import Trainer
+
+
+class CustomTrainer(Trainer):
+    def compute_loss(self, model, inputs, return_outputs=False):
+        labels = inputs.pop("labels")
+        # forward pass
+        outputs = model(**inputs)
+        logits = outputs.get("logits")
+        # compute custom loss (suppose one has 3 labels with different weights)
+        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
+        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
+        return (loss, outputs) if return_outputs else loss
+```
+
+在 PyTorch [`Trainer`] 中自定义训练循环行为的另一种方法是使用 [callbacks](callback)，这些回调可以检查训练循环状态（用于进度报告、在 TensorBoard 或其他 ML 平台上记录日志等）并做出决策（比如提前停止）。
+
+
+## Trainer
+
+[[autodoc]] Trainer - all
+
+## Seq2SeqTrainer
+
+[[autodoc]] Seq2SeqTrainer - evaluate - predict
+
+## TrainingArguments
+
+[[autodoc]] TrainingArguments - all
+
+## Seq2SeqTrainingArguments
+
+[[autodoc]] Seq2SeqTrainingArguments - all
+
+## Checkpoints
+
+默认情况下，[`Trainer`] 会将所有checkpoints保存在你使用的 [`TrainingArguments`] 中设置的 `output_dir` 中。这些checkpoints将位于名为 `checkpoint-xxx` 的子文件夹中，xxx 是训练的步骤。
+
+从checkpoints恢复训练可以通过调用 [`Trainer.train`] 时使用以下任一方式进行：
+
+- `resume_from_checkpoint=True`，这将从最新的checkpoint恢复训练。
+- `resume_from_checkpoint=checkpoint_dir`，这将从指定目录中的特定checkpoint恢复训练。
+
+此外，当使用 `push_to_hub=True` 时，你可以轻松将checkpoints保存在 Model Hub 中。默认情况下，保存在训练中间过程的checkpoints中的所有模型都保存在不同的提交中，但不包括优化器状态。你可以根据需要调整 [`TrainingArguments`] 的 `hub-strategy` 值：
+
+- `"checkpoint"`: 最新的checkpoint也被推送到一个名为 last-checkpoint 的子文件夹中，让你可以通过 `trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")` 轻松恢复训练。
+- `"all_checkpoints"`: 所有checkpoints都像它们出现在输出文件夹中一样被推送（因此你将在最终存储库中的每个文件夹中获得一个checkpoint文件夹）。
+
+## Logging
+
+默认情况下，[`Trainer`] 将对主进程使用 `logging.INFO`，对副本（如果有的话）使用 `logging.WARNING`。
+
+可以通过 [`TrainingArguments`] 的参数覆盖这些默认设置，使用其中的 5 个 `logging` 级别：
+
+- `log_level` - 用于主进程
+- `log_level_replica` - 用于副本
+
+此外，如果 [`TrainingArguments`] 的 `log_on_each_node` 设置为 `False`，则只有主节点将使用其主进程的日志级别设置，所有其他节点将使用副本的日志级别设置。
+
+请注意，[`Trainer`] 将在其 [`Trainer.__init__`] 中分别为每个节点设置 `transformers` 的日志级别。因此，如果在创建 [`Trainer`] 对象之前要调用其他 `transformers` 功能，可能需要更早地设置这一点（请参见下面的示例）。
+
+以下是如何在应用程序中使用的示例：
+
+```python
+[...]
+logger = logging.getLogger(__name__)
+
+# Setup logging
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+
+# set the main code and the modules it uses to the same log-level according to the node
+log_level = training_args.get_process_log_level()
+logger.setLevel(log_level)
+datasets.utils.logging.set_verbosity(log_level)
+transformers.utils.logging.set_verbosity(log_level)
+
+trainer = Trainer(...)
+```
+
+然后，如果你只想在主节点上看到警告，并且所有其他节点不打印任何可能重复的警告，可以这样运行：
+
+```bash
+my_app.py ... --log_level warning --log_level_replica error
+```
+
+在多节点环境中，如果你也不希望每个节点的主进程的日志重复输出，你需要将上面的代码更改为：
+
+```bash
+my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0
+```
+
+然后，只有第一个节点的主进程将以 "warning" 级别记录日志，主节点上的所有其他进程和其他节点上的所有进程将以 "error" 级别记录日志。
+
+如果你希望应用程序尽可能”安静“，可以执行以下操作：
+
+
+```bash
+my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
+```
+
+(如果在多节点环境，添加 `--log_on_each_node 0`)
+
+
+## 随机性
+
+当从 [`Trainer`] 生成的checkpoint恢复训练时，程序会尽一切努力将 _python_、_numpy_ 和 _pytorch_ 的 RNG（随机数生成器）状态恢复为保存检查点时的状态，这样可以使“停止和恢复”式训练尽可能接近“非停止式”训练。
+
+然而，由于各种默认的非确定性 PyTorch 设置，这可能无法完全实现。如果你想要完全确定性，请参阅[控制随机源](https://pytorch.org/docs/stable/notes/randomness)。正如文档中所解释的那样，使事物变得确定的一些设置（例如 `torch.backends.cudnn.deterministic`）可能会减慢速度，因此不能默认执行，但如果需要，你可以自行启用这些设置。
+
+
+## 特定GPU选择
+
+让我们讨论一下如何告诉你的程序应该使用哪些 GPU 以及使用的顺序。
+
+当使用 [`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) 且仅使用部分 GPU 时，你只需指定要使用的 GPU 数量。例如，如果你有 4 个 GPU，但只想使用前 2 个，可以执行以下操作：
+
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=2  trainer-program.py ...
+```
+
+如果你安装了 [`accelerate`](https://github.com/huggingface/accelerate) 或 [`deepspeed`](https://github.com/microsoft/DeepSpeed)，你还可以通过以下任一方法实现相同的效果：
+
+
+```bash
+accelerate launch --num_processes 2 trainer-program.py ...
+```
+
+```bash
+deepspeed --num_gpus 2 trainer-program.py ...
+```
+
+你不需要使用 Accelerate 或 [Deepspeed 集成](Deepspeed) 功能来使用这些启动器。
+
+到目前为止，你已经能够告诉程序要使用多少个 GPU。现在让我们讨论如何选择特定的 GPU 并控制它们的顺序。
+
+以下环境变量可帮助你控制使用哪些 GPU 以及它们的顺序。
+
+
+**`CUDA_VISIBLE_DEVICES`**
+
+如果你有多个 GPU，想要仅使用其中的一个或几个 GPU，请将环境变量 `CUDA_VISIBLE_DEVICES` 设置为要使用的 GPU 列表。
+
+例如，假设你有 4 个 GPU：0、1、2 和 3。要仅在物理 GPU 0 和 2 上运行，你可以执行以下操作：
+
+
+```bash
+CUDA_VISIBLE_DEVICES=0,2 python -m torch.distributed.launch trainer-program.py ...
+```
+
+现在，PyTorch 将只看到 2 个 GPU，其中你的物理 GPU 0 和 2 分别映射到 `cuda:0` 和 `cuda:1`。
+
+你甚至可以改变它们的顺序：
+
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 python -m torch.distributed.launch trainer-program.py ...
+```
+
+这里，你的物理 GPU 0 和 2 分别映射到 `cuda:1` 和 `cuda:0`。
+
+上面的例子都是针对 `DistributedDataParallel` 使用模式的，但同样的方法也适用于 [`DataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html)：
+
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 python trainer-program.py ...
+```
+
+为了模拟没有 GPU 的环境，只需将此环境变量设置为空值，如下所示：
+
+```bash
+CUDA_VISIBLE_DEVICES= python trainer-program.py ...
+```
+
+与任何环境变量一样，你当然可以将其export到环境变量而不是将其添加到命令行，如下所示：
+
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,2
+python -m torch.distributed.launch trainer-program.py ...
+```
+
+这种方法可能会令人困惑，因为你可能会忘记之前设置了环境变量，进而不明白为什么会使用错误的 GPU。因此，在同一命令行中仅为特定运行设置环境变量是一种常见做法，正如本节大多数示例所示。
+
+
+**`CUDA_DEVICE_ORDER`**
+
+还有一个额外的环境变量 `CUDA_DEVICE_ORDER`，用于控制物理设备的排序方式。有两个选择：
+
+1. 按 PCIe 总线 ID 排序（与 nvidia-smi 的顺序相匹配）- 这是默认选项。
+
+
+```bash
+export CUDA_DEVICE_ORDER=PCI_BUS_ID
+```
+
+2. 按 GPU 计算能力排序。
+
+```bash
+export CUDA_DEVICE_ORDER=FASTEST_FIRST
+```
+
+大多数情况下，你不需要关心这个环境变量，但如果你的设置不均匀，那么这将非常有用，例如，您的旧 GPU 和新 GPU 物理上安装在一起，但让速度较慢的旧卡排在运行的第一位。解决这个问题的一种方法是交换卡的位置。但如果不能交换卡（例如，如果设备的散热受到影响），那么设置 `CUDA_DEVICE_ORDER=FASTEST_FIRST` 将始终将较新、更快的卡放在第一位。但这可能会有点混乱，因为 `nvidia-smi` 仍然会按照 PCIe 顺序报告它们。
+
+交换卡的顺序的另一种方法是使用：
+
+
+```bash
+export CUDA_VISIBLE_DEVICES=1,0
+```
+
+在此示例中，我们只使用了 2 个 GPU，但是当然，对于计算机上有的任何数量的 GPU，都适用相同的方法。
+
+此外，如果你设置了这个环境变量，最好将其设置在 `~/.bashrc` 文件或其他启动配置文件中，然后就可以忘记它了。
+
+
+## Trainer集成
+
+[`Trainer`] 已经被扩展，以支持可能显著提高训练时间并适应更大模型的库。
+
+目前，它支持第三方解决方案 [DeepSpeed](https://github.com/microsoft/DeepSpeed) 和 [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html)，它们实现了论文 [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He](https://arxiv.org/abs/1910.02054) 的部分内容。
+
+截至撰写本文，此提供的支持是新的且实验性的。尽管我们欢迎围绕 DeepSpeed 和 PyTorch FSDP 的issues，但我们不再支持 FairScale 集成，因为它已经集成到了 PyTorch 主线（参见 [PyTorch FSDP 集成](#pytorch-fully-sharded-data-parallel)）。
+
+
+<a id='zero-install-notes'></a>
+
+### CUDA拓展安装注意事项
+
+
+撰写时，Deepspeed 需要在使用之前编译 CUDA C++ 代码。
+
+虽然所有安装问题都应通过 [Deepspeed](https://github.com/microsoft/DeepSpeed/issues) 的 GitHub Issues处理，但在构建依赖CUDA 扩展的任何 PyTorch 扩展时，可能会遇到一些常见问题。
+
+因此，如果在执行以下操作时遇到与 CUDA 相关的构建问题：
+
+
+```bash
+pip install deepspeed
+```
+
+请首先阅读以下说明。
+
+在这些说明中，我们提供了在 `pytorch` 使用 CUDA `10.2` 构建时应采取的操作示例。如果你的情况有所不同，请记得将版本号调整为您所需的版本。
+
+
+#### 可能的问题 #1
+
+尽管 PyTorch 自带了其自己的 CUDA 工具包，但要构建这两个项目，你必须在整个系统上安装相同版本的 CUDA。
+
+例如，如果你在 Python 环境中使用 `cudatoolkit==10.2` 安装了 `pytorch`，你还需要在整个系统上安装 CUDA `10.2`。
+
+确切的位置可能因系统而异，但在许多 Unix 系统上，`/usr/local/cuda-10.2` 是最常见的位置。当 CUDA 正确设置并添加到 `PATH` 环境变量时，可以通过执行以下命令找到安装位置：
+
+
+```bash
+which nvcc
+```
+
+如果你尚未在整个系统上安装 CUDA，请首先安装。你可以使用你喜欢的搜索引擎查找说明。例如，如果你使用的是 Ubuntu，你可能想搜索：[ubuntu cuda 10.2 install](https://www.google.com/search?q=ubuntu+cuda+10.2+install)。
+
+
+#### 可能的问题 #2
+
+另一个可能的常见问题是你可能在整个系统上安装了多个 CUDA 工具包。例如，你可能有：
+
+
+```bash
+/usr/local/cuda-10.2
+/usr/local/cuda-11.0
+```
+
+在这种情况下，你需要确保 `PATH` 和 `LD_LIBRARY_PATH` 环境变量包含所需 CUDA 版本的正确路径。通常，软件包安装程序将设置这些变量以包含最新安装的版本。如果遇到构建失败的问题，且是因为在整个系统安装但软件仍找不到正确的 CUDA 版本，这意味着你需要调整这两个环境变量。
+
+首先，你以查看它们的内容：
+
+
+```bash
+echo $PATH
+echo $LD_LIBRARY_PATH
+```
+
+因此，您可以了解其中的内容。
+
+`LD_LIBRARY_PATH` 可能是空的。
+
+`PATH` 列出了可以找到可执行文件的位置，而 `LD_LIBRARY_PATH` 用于查找共享库。在这两种情况下，较早的条目优先于较后的条目。 `:` 用于分隔多个条目。
+
+现在，为了告诉构建程序在哪里找到特定的 CUDA 工具包，请插入所需的路径，让其首先列出：
+
+
+```bash
+export PATH=/usr/local/cuda-10.2/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH
+```
+
+请注意，我们没有覆盖现有值，而是在前面添加新的值。
+
+当然，根据需要调整版本号和完整路径。检查你分配的目录是否实际存在。`lib64` 子目录是各种 CUDA `.so` 对象（如 `libcudart.so`）的位置，这个名字可能在你的系统中是不同的，如果是，请调整以反映实际情况。
+
+
+#### 可能的问题 #3
+
+一些较旧的 CUDA 版本可能会拒绝使用更新的编译器。例如，你可能有 `gcc-9`，但 CUDA 可能需要 `gcc-7`。
+
+有各种方法可以解决这个问题。
+
+如果你可以安装最新的 CUDA 工具包，通常它应该支持更新的编译器。
+
+或者，你可以在已经拥有的编译器版本之外安装较低版本，或者你可能已经安装了它但它不是默认的编译器，因此构建系统无法找到它。如果你已经安装了 `gcc-7` 但构建系统找不到它，以下操作可能会解决问题：
+
+
+```bash
+sudo ln -s /usr/bin/gcc-7  /usr/local/cuda-10.2/bin/gcc
+sudo ln -s /usr/bin/g++-7  /usr/local/cuda-10.2/bin/g++
+```
+
+这里，我们正在从 `/usr/local/cuda-10.2/bin/gcc` 创建到 `gcc-7` 的软链接，由于 `/usr/local/cuda-10.2/bin/` 应该在 `PATH` 环境变量中（参见前一个问题的解决方案），它应该能够找到 `gcc-7`（和 `g++7`），然后构建将成功。
+
+与往常一样，请确保编辑示例中的路径以匹配你的情况。
+
+
+
+### PyTorch完全分片数据并行（FSDP)
+
+为了加速在更大批次大小上训练庞大模型，我们可以使用完全分片的数据并行模型。这种数据并行范例通过对优化器状态、梯度和参数进行分片，实现了在更多数据和更大模型上的训练。要了解更多信息以及其优势，请查看[完全分片的数据并行博客](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/)。我们已经集成了最新的PyTorch完全分片的数据并行（FSDP）训练功能。您只需通过配置启用它。
+
+**FSDP支持所需的PyTorch版本**: PyTorch Nightly（或者如果你在发布后阅读这个，使用1.12.0版本，因为带有激活的FSDP的模型保存仅在最近的修复中可用。
+
+
+**用法**:
+
+- 如果你尚未使用过分布式启动器，确保你已经添加了它 `-m torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`。
+
+- **分片策略**：
+  - FULL_SHARD：在数据并行线程/GPU之间，对优化器状态、梯度和模型参数进行分片。
+    为此，请在命令行参数中添加 `--fsdp full_shard`。
+  - SHARD_GRAD_OP：在数据并行线程/GPU之间对优化器状态和梯度进行分片。
+    为此，请在命令行参数中添加 `--fsdp shard_grad_op`。
+  - NO_SHARD：不进行分片。为此，请在命令行参数中添加 `--fsdp no_shard`。
+- 要将参数和梯度卸载到CPU，添加 `--fsdp "full_shard offload"` 或 `--fsdp "shard_grad_op offload"` 到命令行参数中。
+- 要使用 `default_auto_wrap_policy` 自动递归地用FSDP包装层，请添加 `--fsdp "full_shard auto_wrap"` 或 `--fsdp "shard_grad_op auto_wrap"` 到命令行参数中。
+- 要同时启用CPU卸载和自动包装层工具，请添加 `--fsdp "full_shard offload auto_wrap"` 或 `--fsdp "shard_grad_op offload auto_wrap"` 到命令行参数中。
+- 其余的FSDP配置通过 `--fsdp_config <path_to_fsdp_config.json>` 传递。它可以是FSDP json配置文件的位置（例如，`fsdp_config.json`）或已加载的json文件作为 `dict`。
+  - 如果启用了自动包装，您可以使用基于transformer的自动包装策略或基于大小的自动包装策略。
+    - 对于基于transformer的自动包装策略，建议在配置文件中指定 `fsdp_transformer_layer_cls_to_wrap`。如果未指定，则默认值为 `model._no_split_modules`（如果可用）。这将指定要包装的transformer层类名（区分大小写），例如 [`BertLayer`]、[`GPTJBlock`]、[`T5Block`] 等。这很重要，因为共享权重的子模块（例如，embedding层）不应最终出现在不同的FSDP包装单元中。使用此策略，每个包装的块将包含多头注意力和后面的几个MLP层。剩余的层，包括共享的embedding层，都将被方便地包装在同一个最外层的FSDP单元中。因此，对于基于transformer的模型，请使用这个方法。
+    - 对于基于大小的自动包装策略，请在配置文件中添加 `fsdp_min_num_params`。它指定了FSDP进行自动包装的最小参数数量。
+  - 可以在配置文件中指定 `fsdp_backward_prefetch`。它控制何时预取下一组参数。`backward_pre` 和 `backward_pos` 是可用的选项。有关更多信息，请参阅 `torch.distributed.fsdp.fully_sharded_data_parallel.BackwardPrefetch`
+  - 可以在配置文件中指定 `fsdp_forward_prefetch`。它控制何时预取下一组参数。如果是`"True"`，在执行前向传递时，FSDP明确地预取下一次即将发生的全局聚集。
+  - 可以在配置文件中指定 `limit_all_gathers`。如果是`"True"`，FSDP明确地同步CPU线程，以防止太多的进行中的全局聚集。
+  - 可以在配置文件中指定 `activation_checkpointing`。如果是`"True"`，FSDP activation checkpoint是一种通过清除某些层的激活值并在反向传递期间重新计算它们来减少内存使用的技术。实际上，这以更多的计算时间为代价减少了内存使用。
+
+
+**需要注意几个注意事项**
+- 它与 `generate` 不兼容，因此与所有seq2seq/clm脚本（翻译/摘要/clm等）中的 `--predict_with_generate` 不兼容。请参阅issue[#21667](https://github.com/huggingface/transformers/issues/21667)。
+
+
+### PyTorch/XLA 完全分片数据并行
+
+对于所有TPU用户，有个好消息！PyTorch/XLA现在支持FSDP。所有最新的完全分片数据并行（FSDP）训练都受支持。有关更多信息，请参阅[在云端TPU上使用FSDP扩展PyTorch模型](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/)和[PyTorch/XLA FSDP的实现](https://github.com/pytorch/xla/tree/master/torch_xla/distributed/fsdp)。使用它只需通过配置启用。
+
+**需要的 PyTorch/XLA 版本以支持 FSDP**：>=2.0
+
+**用法**：
+
+传递 `--fsdp "full shard"`，同时对 `--fsdp_config <path_to_fsdp_config.json>` 进行以下更改：
+- `xla` 应设置为 `True` 以启用 PyTorch/XLA FSDP。
+- `xla_fsdp_settings` 的值是一个字典，存储 XLA FSDP 封装参数。完整的选项列表，请参见[此处](https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py)。
+- `xla_fsdp_grad_ckpt`。当 `True` 时，在每个嵌套的 XLA FSDP 封装层上使用梯度checkpoint。该设置只能在将 xla 标志设置为 true，并通过 `fsdp_min_num_params` 或 `fsdp_transformer_layer_cls_to_wrap` 指定自动包装策略时使用。
+- 您可以使用基于transformer的自动包装策略或基于大小的自动包装策略。
+  - 对于基于transformer的自动包装策略，建议在配置文件中指定 `fsdp_transformer_layer_cls_to_wrap`。如果未指定，默认值为 `model._no_split_modules`（如果可用）。这指定了要包装的transformer层类名列表（区分大小写），例如 [`BertLayer`]、[`GPTJBlock`]、[`T5Block`] 等。这很重要，因为共享权重的子模块（例如，embedding层）不应最终出现在不同的FSDP包装单元中。使用此策略，每个包装的块将包含多头注意力和后面的几个MLP层。剩余的层，包括共享的embedding层，都将被方便地包装在同一个最外层的FSDP单元中。因此，对于基于transformer的模型，请使用这个方法。
+  - 对于基于大小的自动包装策略，请在配置文件中添加 `fsdp_min_num_params`。它指定了自动包装的 FSDP 的最小参数数量。
+
+
+### 在 Mac 上使用 Trainer 进行加速的 PyTorch 训练
+
+随着 PyTorch v1.12 版本的发布，开发人员和研究人员可以利用 Apple Silicon GPU 进行显著更快的模型训练。这使得可以在 Mac 上本地执行原型设计和微调等机器学习工作流程。Apple 的 Metal Performance Shaders（MPS）作为 PyTorch 的后端实现了这一点，并且可以通过新的 `"mps"` 设备来使用。
+这将在 MPS 图形框架上映射计算图和神经图元，并使用 MPS 提供的优化内核。更多信息，请参阅官方文档 [Introducing Accelerated PyTorch Training on Mac](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) 和 [MPS BACKEND](https://pytorch.org/docs/stable/notes/mps.html)。
+
+
+<Tip warning={false}>
+
+我们强烈建议在你的 MacOS 机器上安装 PyTorch >= 1.13（在撰写本文时为最新版本）。对于基于 transformer 的模型， 它提供与模型正确性和性能改进相关的重大修复。有关更多详细信息，请参阅[pytorch/pytorch#82707](https://github.com/pytorch/pytorch/issues/82707)。
+
+</Tip>
+
+**使用 Apple Silicon 芯片进行训练和推理的好处**
+
+1. 使用户能够在本地训练更大的网络或批量数据。
+2. 由于统一内存架构，减少数据检索延迟，并为 GPU 提供对完整内存存储的直接访问。从而提高端到端性能。
+3. 降低与基于云的开发或需要额外本地 GPU 的成本。
+
+**先决条件**：要安装带有 mps 支持的 torch，请按照这篇精彩的 Medium 文章操作 [GPU-Acceleration Comes to PyTorch on M1 Macs](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1)。
+
+**用法**：
+如果可用，`mps` 设备将默认使用，类似于使用 `cuda` 设备的方式。因此，用户无需采取任何操作。例如，您可以使用以下命令在 Apple Silicon GPU 上运行官方的 Glue 文本分类任务（从根文件夹运行）：
+
+```bash
+export TASK_NAME=mrpc
+
+python examples/pytorch/text-classification/run_glue.py \
+  --model_name_or_path bert-base-cased \
+  --task_name $TASK_NAME \
+  --do_train \
+  --do_eval \
+  --max_seq_length 128 \
+  --per_device_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --output_dir /tmp/$TASK_NAME/ \
+  --overwrite_output_dir
+```
+
+**需要注意的一些注意事项**
+
+1. 一些 PyTorch 操作尚未在 mps 中实现，将引发错误。解决此问题的一种方法是设置环境变量 `PYTORCH_ENABLE_MPS_FALLBACK=1`，它将把这些操作回退到 CPU 进行。然而，它仍然会抛出 UserWarning 信息。
+2. 分布式设置 `gloo` 和 `nccl` 在 `mps` 设备上不起作用。这意味着当前只能使用 `mps` 设备类型的单个 GPU。
+
+最后，请记住，🤗 `Trainer` 仅集成了 MPS 后端，因此如果你在使用 MPS 后端时遇到任何问题或有疑问，请在 [PyTorch GitHub](https://github.com/pytorch/pytorch/issues) 上提交问题。
+
+
+## 通过 Accelerate Launcher 使用 Trainer
+
+Accelerate 现在支持 Trainer。用户可以期待以下内容：
+- 他们可以继续使用 Trainer 的迭代，如 FSDP、DeepSpeed 等，而无需做任何更改。
+- 现在可以在 Trainer 中使用 Accelerate Launcher（建议使用）。
+
+通过 Accelerate Launcher 使用 Trainer 的步骤：
+1. 确保已安装 🤗 Accelerate，无论如何，如果没有它，你无法使用 `Trainer`。如果没有，请执行 `pip install accelerate`。你可能还需要更新 Accelerate 的版本：`pip install accelerate --upgrade`。
+2. 运行 `accelerate config` 并填写问题。以下是一些加速配置的示例：
+   
+  a. DDP 多节点多 GPU 配置：
+
+    ```yaml
+    compute_environment: LOCAL_MACHINE                                                                                             
+    distributed_type: MULTI_GPU                                                                                                    
+    downcast_bf16: 'no'
+    gpu_ids: all
+    machine_rank: 0 #change rank as per the node
+    main_process_ip: 192.168.20.1
+    main_process_port: 9898
+    main_training_function: main
+    mixed_precision: fp16
+    num_machines: 2
+    num_processes: 8
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+
+  b. FSDP 配置：
+
+    ```yaml
+    compute_environment: LOCAL_MACHINE
+    distributed_type: FSDP
+    downcast_bf16: 'no'
+    fsdp_config:
+      fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+      fsdp_backward_prefetch_policy: BACKWARD_PRE
+      fsdp_forward_prefetch: true
+      fsdp_offload_params: false
+      fsdp_sharding_strategy: 1
+      fsdp_state_dict_type: FULL_STATE_DICT
+      fsdp_sync_module_states: true
+      fsdp_transformer_layer_cls_to_wrap: BertLayer
+      fsdp_use_orig_params: true
+    machine_rank: 0
+    main_training_function: main
+    mixed_precision: bf16
+    num_machines: 1
+    num_processes: 2
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+ 
+  c. 指向文件的 DeepSpeed 配置：
+
+    ```yaml
+    compute_environment: LOCAL_MACHINE
+    deepspeed_config:
+      deepspeed_config_file: /home/user/configs/ds_zero3_config.json
+      zero3_init_flag: true
+    distributed_type: DEEPSPEED
+    downcast_bf16: 'no'
+    machine_rank: 0
+    main_training_function: main
+    num_machines: 1
+    num_processes: 4
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+
+  d. 使用 accelerate 插件的 DeepSpeed 配置：
+
+    ```yaml
+    compute_environment: LOCAL_MACHINE                                                                                             
+    deepspeed_config:                                                                                                              
+      gradient_accumulation_steps: 1
+      gradient_clipping: 0.7
+      offload_optimizer_device: cpu
+      offload_param_device: cpu
+      zero3_init_flag: true
+      zero_stage: 2
+    distributed_type: DEEPSPEED
+    downcast_bf16: 'no'
+    machine_rank: 0
+    main_training_function: main
+    mixed_precision: bf16
+    num_machines: 1
+    num_processes: 4
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+
+3. 使用accelerate配置文件参数或启动器参数以外的参数运行Trainer脚本。以下是一个使用上述FSDP配置从accelerate启动器运行`run_glue.py`的示例。
+
+```bash
+cd transformers
+
+accelerate launch \
+./examples/pytorch/text-classification/run_glue.py \
+--model_name_or_path bert-base-cased \
+--task_name $TASK_NAME \
+--do_train \
+--do_eval \
+--max_seq_length 128 \
+--per_device_train_batch_size 16 \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--output_dir /tmp/$TASK_NAME/ \
+--overwrite_output_dir
+```
+
+4. 你也可以直接使用`accelerate launch`的cmd参数。上面的示例将映射到：
+
+```bash
+cd transformers
+
+accelerate launch --num_processes=2 \
+--use_fsdp \
+--mixed_precision=bf16 \
+--fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP  \
+--fsdp_transformer_layer_cls_to_wrap="BertLayer" \
+--fsdp_sharding_strategy=1 \
+--fsdp_state_dict_type=FULL_STATE_DICT \
+./examples/pytorch/text-classification/run_glue.py
+--model_name_or_path bert-base-cased \
+--task_name $TASK_NAME \
+--do_train \
+--do_eval \
+--max_seq_length 128 \
+--per_device_train_batch_size 16 \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--output_dir /tmp/$TASK_NAME/ \
+--overwrite_output_dir
+```
+
+有关更多信息，请参阅 🤗 Accelerate CLI 指南：[启动您的 🤗 Accelerate 脚本](https://huggingface.co/docs/accelerate/basic_tutorials/launch)。
+
+已移动的部分：
+
+[ <a href="./deepspeed#deepspeed-trainer-integration">DeepSpeed</a><a id="deepspeed"></a> | <a href="./deepspeed#deepspeed-installation">Installation</a><a id="installation"></a> | <a href="./deepspeed#deepspeed-multi-gpu">Deployment with multiple GPUs</a><a id="deployment-with-multiple-gpus"></a> | <a href="./deepspeed#deepspeed-one-gpu">Deployment with one GPU</a><a id="deployment-with-one-gpu"></a> | <a href="./deepspeed#deepspeed-notebook">Deployment in Notebooks</a><a id="deployment-in-notebooks"></a> | <a href="./deepspeed#deepspeed-config">Configuration</a><a id="configuration"></a> | <a href="./deepspeed#deepspeed-config-passing">Passing Configuration</a><a id="passing-configuration"></a> | <a href="./deepspeed#deepspeed-config-shared">Shared Configuration</a><a id="shared-configuration"></a> | <a href="./deepspeed#deepspeed-zero">ZeRO</a><a id="zero"></a> | <a href="./deepspeed#deepspeed-zero2-config">ZeRO-2 Config</a><a id="zero-2-config"></a> | <a href="./deepspeed#deepspeed-zero3-config">ZeRO-3 Config</a><a id="zero-3-config"></a> | <a href="./deepspeed#deepspeed-nvme">NVMe Support</a><a id="nvme-support"></a> | <a href="./deepspeed#deepspeed-zero2-zero3-performance">ZeRO-2 vs ZeRO-3 Performance</a><a id="zero-2-vs-zero-3-performance"></a> | <a href="./deepspeed#deepspeed-zero2-example">ZeRO-2 Example</a><a id="zero-2-example"></a> | <a href="./deepspeed#deepspeed-zero3-example">ZeRO-3 Example</a><a id="zero-3-example"></a> | <a href="./deepspeed#deepspeed-optimizer">Optimizer</a><a id="optimizer"></a> | <a href="./deepspeed#deepspeed-scheduler">Scheduler</a><a id="scheduler"></a> | <a href="./deepspeed#deepspeed-fp32">fp32 Precision</a><a id="fp32-precision"></a> | <a href="./deepspeed#deepspeed-amp">Automatic Mixed Precision</a><a id="automatic-mixed-precision"></a> | <a href="./deepspeed#deepspeed-bs">Batch Size</a><a id="batch-size"></a> | <a href="./deepspeed#deepspeed-grad-acc">Gradient Accumulation</a><a id="gradient-accumulation"></a> | <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a> | <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>]
+
+
+## 通过 NEFTune 提升微调性能
+
+NEFTune 是一种提升聊天模型性能的技术，由 Jain 等人在论文“NEFTune: Noisy Embeddings Improve Instruction Finetuning” 中引入。该技术在训练过程中向embedding向量添加噪音。根据论文摘要：
+
+> 使用 Alpaca 对 LLaMA-2-7B 进行标准微调，可以在 AlpacaEval 上达到 29.79%，而使用带有噪音embedding的情况下，性能提高至 64.69%。NEFTune 还在modern instruction数据集上大大优于基线。Evol-Instruct 训练的模型表现提高了 10%，ShareGPT 提高了 8%，OpenPlatypus 提高了 8%。即使像 LLaMA-2-Chat 这样通过 RLHF 进一步细化的强大模型，通过 NEFTune 的额外训练也能受益。
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/neft-screenshot.png">
+</div>
+
+要在 `Trainer` 中使用它，只需在创建 `TrainingArguments` 实例时传递 `neftune_noise_alpha`。请注意，为了避免任何意外行为，NEFTune在训练后被禁止，以此恢复原始的embedding层。
+
+```python
+from transformers import Trainer, TrainingArguments
+
+args = TrainingArguments(..., neftune_noise_alpha=0.1)
+trainer = Trainer(..., args=args)
+
+...
+
+trainer.train()
+```
diff --git a/docs/source/zh/perf_hardware.md b/docs/source/zh/perf_hardware.md
index f49e9a582963c9..ce7ab36151bfbe 100644
--- a/docs/source/zh/perf_hardware.md
+++ b/docs/source/zh/perf_hardware.md
@@ -135,7 +135,7 @@ GPU1    PHB      X      0-11            N/A
 ```bash
 # DDP w/ NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
@@ -144,7 +144,7 @@ rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 python -m torch.distributed.launch
 
 # DDP w/o NVLink
 
-rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 python -m torch.distributed.launch \
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
 --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path gpt2 \
 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train
 --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
diff --git a/docs/source/zh/preprocessing.md b/docs/source/zh/preprocessing.md
index 95b799989c9197..f2b3189dd4a6c7 100644
--- a/docs/source/zh/preprocessing.md
+++ b/docs/source/zh/preprocessing.md
@@ -227,7 +227,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 对于音频任务，您需要[feature extractor](main_classes/feature_extractor)来准备您的数据集以供模型使用。`feature extractor`旨在从原始音频数据中提取特征，并将它们转换为张量。
 
-加载[MInDS-14](https://huggingface.co/datasets/PolyAI/minds14)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets教程](https://huggingface.co/docs/datasets/load_hub.html)）以了解如何在音频数据集中使用`feature extractor`：
+加载[MInDS-14](https://huggingface.co/datasets/PolyAI/minds14)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets教程](https://huggingface.co/docs/datasets/load_hub)）以了解如何在音频数据集中使用`feature extractor`：
 
 
 ```py
@@ -352,7 +352,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 </Tip>
 
-加载[food101](https://huggingface.co/datasets/food101)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets教程](https://huggingface.co/docs/datasets/load_hub.html)）以了解如何在计算机视觉数据集中使用图像处理器：
+加载[food101](https://huggingface.co/datasets/food101)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets教程](https://huggingface.co/docs/datasets/load_hub)）以了解如何在计算机视觉数据集中使用图像处理器：
 
 <Tip>
 
@@ -367,7 +367,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 >>> dataset = load_dataset("food101", split="train[:100]")
 ```
 
-接下来，使用🤗 Datasets的[`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image)功能查看图像：
+接下来，使用🤗 Datasets的[`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=image#datasets.Image)功能查看图像：
 
 
 ```py
@@ -421,7 +421,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 </Tip>
 
-3. 然后使用🤗 Datasets的[`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)在运行时应用这些变换：
+3. 然后使用🤗 Datasets的[`set_transform`](https://huggingface.co/docs/datasets/process#format-transform)在运行时应用这些变换：
 
 
 ```py
@@ -476,7 +476,7 @@ array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
 
 对于涉及多模态输入的任务，您需要[processor](main_classes/processors)来为模型准备数据集。`processor`将两个处理对象-例如`tokenizer`和`feature extractor`-组合在一起。
 
-加载[LJ Speech](https://huggingface.co/datasets/lj_speech)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets 教程](https://huggingface.co/docs/datasets/load_hub.html)）以了解如何使用`processor`进行自动语音识别（ASR）：
+加载[LJ Speech](https://huggingface.co/datasets/lj_speech)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets 教程](https://huggingface.co/docs/datasets/load_hub)）以了解如何使用`processor`进行自动语音识别（ASR）：
 
 
 ```py
diff --git a/docs/source/zh/run_scripts.md b/docs/source/zh/run_scripts.md
index e5cc56487dabee..0a0121c32f0b27 100644
--- a/docs/source/zh/run_scripts.md
+++ b/docs/source/zh/run_scripts.md
@@ -133,7 +133,7 @@ python examples/tensorflow/summarization/run_summarization.py  \
 
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/summarization/run_summarization.py \
     --fp16 \
     --model_name_or_path t5-small \
diff --git a/docs/source/zh/training.md b/docs/source/zh/training.md
index 4ef49b459f95b9..89908130fe303a 100644
--- a/docs/source/zh/training.md
+++ b/docs/source/zh/training.md
@@ -43,7 +43,7 @@ rendered properly in your Markdown viewer.
  'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
 ```
 
-正如您现在所知，您需要一个`tokenizer`来处理文本，包括填充和截断操作以处理可变的序列长度。如果要一次性处理您的数据集，可以使用 🤗 Datasets 的 [`map`](https://huggingface.co/docs/datasets/process.html#map) 方法，将预处理函数应用于整个数据集：
+正如您现在所知，您需要一个`tokenizer`来处理文本，包括填充和截断操作以处理可变的序列长度。如果要一次性处理您的数据集，可以使用 🤗 Datasets 的 [`map`](https://huggingface.co/docs/datasets/process#map) 方法，将预处理函数应用于整个数据集：
 
 ```py
 >>> from transformers import AutoTokenizer
diff --git a/examples/flax/image-captioning/README.md b/examples/flax/image-captioning/README.md
index 0faf56124bc2d0..b76dc4cd057f66 100644
--- a/examples/flax/image-captioning/README.md
+++ b/examples/flax/image-captioning/README.md
@@ -1,7 +1,7 @@
 # Image Captioning (vision-encoder-text-decoder model) training example
 
 The following example showcases how to finetune a vision-encoder-text-decoder model for image captioning
-using the JAX/Flax backend, leveraging 🤗 Transformers library's [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/visionencoderdecoder#transformers.FlaxVisionEncoderDecoderModel).
+using the JAX/Flax backend, leveraging 🤗 Transformers library's [FlaxVisionEncoderDecoderModel](https://huggingface.co/docs/transformers/model_doc/vision-encoder-decoder#transformers.FlaxVisionEncoderDecoderModel).
 
 JAX/Flax allows you to trace pure functions and compile them into efficient, fused accelerator code on both GPU and TPU.
 Models written in JAX/Flax are **immutable** and updated in a purely functional
@@ -10,7 +10,7 @@ way which enables simple and efficient model parallelism.
 `run_image_captioning_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets
 library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
 
-For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files and you also will find examples of these below.
+For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files and you also will find examples of these below.
 
 ### Download COCO dataset (2017)
 This example uses COCO dataset (2017) through a custom dataset script, which requires users to manually download the
diff --git a/examples/flax/image-captioning/run_image_captioning_flax.py b/examples/flax/image-captioning/run_image_captioning_flax.py
index 8f5e09e315eaae..859a006dbddc00 100644
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@@ -494,7 +494,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     model = FlaxVisionEncoderDecoderModel.from_pretrained(
diff --git a/examples/flax/language-modeling/run_bart_dlm_flax.py b/examples/flax/language-modeling/run_bart_dlm_flax.py
index 53af12fab68496..8603482218b422 100644
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@@ -589,7 +589,7 @@ def main():
                 num_proc=data_args.preprocessing_num_workers,
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/flax/language-modeling/run_clm_flax.py b/examples/flax/language-modeling/run_clm_flax.py
index 157c0b78f8e0c7..48d924f9bb3948 100755
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -484,7 +484,7 @@ def main():
                 num_proc=data_args.preprocessing_num_workers,
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/flax/language-modeling/run_mlm_flax.py b/examples/flax/language-modeling/run_mlm_flax.py
index d89b4c4dc93c9a..39fc5e7836376e 100755
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -516,7 +516,7 @@ def main():
                 num_proc=data_args.preprocessing_num_workers,
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/flax/language-modeling/run_t5_mlm_flax.py b/examples/flax/language-modeling/run_t5_mlm_flax.py
index bbf7b827c81c65..45d3fe32bcf9f1 100755
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -630,7 +630,7 @@ def main():
                 num_proc=data_args.preprocessing_num_workers,
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/flax/question-answering/run_qa.py b/examples/flax/question-answering/run_qa.py
index c0bf8b63250fd8..51f57c1a04f4c6 100644
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@@ -536,7 +536,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
     # endregion
 
     # region Load pretrained model and tokenizer
diff --git a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
index 9f602e1c85862d..44721d6f41a14e 100644
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@@ -640,7 +640,7 @@ def compute_metrics(preds, labels):
 
     # Create learning rate schedule
     linear_decay_lr_schedule_fn = create_learning_rate_fn(
-        len(vectorized_datasets["train"]),
+        total_train_steps,
         training_args.warmup_steps,
         training_args.learning_rate,
     )
diff --git a/examples/flax/summarization/README.md b/examples/flax/summarization/README.md
index bbe231f31a569f..c94b048ec88b42 100644
--- a/examples/flax/summarization/README.md
+++ b/examples/flax/summarization/README.md
@@ -9,7 +9,7 @@ way which enables simple and efficient model parallelism.
 
 `run_summarization_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
 
-For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files and you also will find examples of these below.
+For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files and you also will find examples of these below.
 
 ### Train the model
 Next we can run the example script to train the model:
diff --git a/examples/flax/summarization/run_summarization_flax.py b/examples/flax/summarization/run_summarization_flax.py
index a7d6633f64f8b2..f39882362e2678 100644
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -521,7 +521,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/flax/text-classification/run_flax_glue.py b/examples/flax/text-classification/run_flax_glue.py
index 632a66841b3651..36b1ce58ec1131 100755
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@@ -410,7 +410,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Labels
     if data_args.task_name is not None:
@@ -427,7 +427,7 @@ def main():
             num_labels = 1
         else:
             # A useful fast method:
-            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
+            # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.unique
             label_list = raw_datasets["train"].unique("label")
             label_list.sort()  # Let's sort it for determinism
             num_labels = len(label_list)
diff --git a/examples/flax/token-classification/run_flax_ner.py b/examples/flax/token-classification/run_flax_ner.py
index 2060508079df3a..ff5efd8a2d5382 100644
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@@ -465,7 +465,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if raw_datasets["train"] is not None:
         column_names = raw_datasets["train"].column_names
diff --git a/examples/legacy/question-answering/README.md b/examples/legacy/question-answering/README.md
index 494ae4ffd7eebf..905fabf35bdf6c 100644
--- a/examples/legacy/question-answering/README.md
+++ b/examples/legacy/question-answering/README.md
@@ -18,7 +18,7 @@ in Huang et al. [Improve Transformer Models with Better Relative Position Embedd
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
     --model_name_or_path zhiheng-huang/bert-base-uncased-embedding-relative-key-query \
     --dataset_name squad \
     --do_train \
@@ -46,7 +46,7 @@ gpu training leads to the f1 score of 90.71.
 
 ```bash
 export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
     --model_name_or_path zhiheng-huang/bert-large-uncased-whole-word-masking-embedding-relative-key-query \
     --dataset_name squad \
     --do_train \
@@ -68,7 +68,7 @@ Training with the above command leads to the f1 score of 93.52, which is slightl
 Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.1:
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
+torchrun --nproc_per_node=8 ./examples/question-answering/run_squad.py \
     --model_name_or_path bert-large-uncased-whole-word-masking \
     --dataset_name squad \
     --do_train \
diff --git a/examples/legacy/seq2seq/README.md b/examples/legacy/seq2seq/README.md
index 5a3c2dbd3506be..347a980a74da05 100644
--- a/examples/legacy/seq2seq/README.md
+++ b/examples/legacy/seq2seq/README.md
@@ -140,7 +140,7 @@ python finetune_trainer.py --help
 
 For multi-gpu training use `torch.distributed.launch`, e.g. with 2 gpus:
 ```bash
-python -m torch.distributed.launch --nproc_per_node=2  finetune_trainer.py ...
+torchrun --nproc_per_node=2  finetune_trainer.py ...
 ```
 
 **At the moment, `Seq2SeqTrainer` does not support *with teacher* distillation.**
@@ -214,7 +214,7 @@ because it uses SortishSampler to minimize padding. You can also use it on 1 GPU
 `{type_path}.source` and `{type_path}.target`. Run `./run_distributed_eval.py --help` for all clargs.
 
 ```bash
-python -m torch.distributed.launch --nproc_per_node=8  run_distributed_eval.py \
+torchrun --nproc_per_node=8  run_distributed_eval.py \
     --model_name sshleifer/distilbart-large-xsum-12-3  \
     --save_dir xsum_generations \
     --data_dir xsum \
diff --git a/examples/pytorch/README.md b/examples/pytorch/README.md
index fd98a8e9180df0..ab2f05337c5db8 100644
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@@ -98,7 +98,7 @@ the [Trainer API](https://huggingface.co/transformers/main_classes/trainer.html)
 use the following command:
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node number_of_gpu_you_have path_to_script.py \
 	--all_arguments_of_the_script
 ```
@@ -107,7 +107,7 @@ As an example, here is how you would fine-tune the BERT large model (with whole
 classification MNLI task using the `run_glue` script, with 8 GPUs:
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
     --nproc_per_node 8 pytorch/text-classification/run_glue.py \
     --model_name_or_path bert-large-uncased-whole-word-masking \
     --task_name mnli \
diff --git a/examples/pytorch/contrastive-image-text/run_clip.py b/examples/pytorch/contrastive-image-text/run_clip.py
index e72db1f7f1d479..fe8cafa90a852d 100644
--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@@ -340,7 +340,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # 5. Load pretrained model, tokenizer, and image processor
     if model_args.tokenizer_name:
diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py
index 7c668919253bc0..3677ce24b3aa60 100755
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -388,7 +388,7 @@ def main():
             )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
@@ -510,7 +510,10 @@ def tokenize_function(examples):
                 f"The tokenizer picked seems to have a very large `model_max_length` ({tokenizer.model_max_length}). "
                 f"Using block_size={min(1024, max_pos_embeddings)} instead. You can change that default value by passing --block_size xxx."
             )
-            block_size = min(1024, max_pos_embeddings)
+            if max_pos_embeddings > 0:
+                block_size = min(1024, max_pos_embeddings)
+            else:
+                block_size = 1024
     else:
         if data_args.block_size > tokenizer.model_max_length:
             logger.warning(
diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py
index b14649483d5bf5..c95ce9a083368c 100755
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -368,7 +368,7 @@ def main():
             )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py
index 3effeb16fc1e15..b6b01ee29e8364 100755
--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -382,7 +382,7 @@ def main():
             )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
index 6427af1f4089f4..58974ed45940ee 100755
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -371,7 +371,7 @@ def main():
             )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py
index 34c75149caeb33..f1d607c5fd4e71 100755
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -352,7 +352,7 @@ def main():
             )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/multiple-choice/run_swag.py b/examples/pytorch/multiple-choice/run_swag.py
index 35a2ecd5e7947c..4304979671300b 100755
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -329,7 +329,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/pytorch/multiple-choice/run_swag_no_trainer.py b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
index 91c9337f4b8a9a..38e5eb02b121c1 100755
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -366,7 +366,7 @@ def main():
         for split in raw_datasets.keys():
             raw_datasets[split] = raw_datasets[split].select(range(100))
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if raw_datasets["train"] is not None:
         column_names = raw_datasets["train"].column_names
diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py
index ff007292bb1913..bdc6cb444f149b 100755
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -337,7 +337,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/question-answering/run_qa_beam_search.py b/examples/pytorch/question-answering/run_qa_beam_search.py
index 187afe569388af..f6809c6186debd 100755
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -325,7 +325,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
index ba813c32131150..ca5589e8e9a653 100644
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -369,7 +369,7 @@ def main():
         extension = args.train_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/question-answering/run_qa_no_trainer.py b/examples/pytorch/question-answering/run_qa_no_trainer.py
index 97a72bf40cb5e2..2db77e1899c57c 100755
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -417,7 +417,7 @@ def main():
         extension = args.train_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data")
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/question-answering/run_seq2seq_qa.py b/examples/pytorch/question-answering/run_seq2seq_qa.py
index cc5ccc97be556b..0cbc8860244974 100644
--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@@ -382,7 +382,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/speech-recognition/README.md b/examples/pytorch/speech-recognition/README.md
index 6ae2e1abef6026..32fa9ac8b8e3f5 100644
--- a/examples/pytorch/speech-recognition/README.md
+++ b/examples/pytorch/speech-recognition/README.md
@@ -100,7 +100,7 @@ of **0.35**.
 The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface.co/transformers/main/model_doc/xlsr_wav2vec2.html) on [Common Voice](https://huggingface.co/datasets/common_voice) using 8 GPUs in half-precision.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
 	--nproc_per_node 8 run_speech_recognition_ctc.py \
 	--dataset_name="common_voice" \
 	--model_name_or_path="facebook/wav2vec2-large-xlsr-53" \
@@ -134,7 +134,7 @@ of **0.36**.
 
 ### Multi GPU CTC with Dataset Streaming
 
-The following command shows how to use [Dataset Streaming mode](https://huggingface.co/docs/datasets/dataset_streaming.html)
+The following command shows how to use [Dataset Streaming mode](https://huggingface.co/docs/datasets/dataset_streaming)
 to fine-tune [XLS-R](https://huggingface.co/transformers/main/model_doc/xls_r.html) 
 on [Common Voice](https://huggingface.co/datasets/common_voice) using 4 GPUs in half-precision.
 
@@ -147,7 +147,7 @@ However, the `--shuffle_buffer_size` argument controls how many examples we can
 
 
 ```bash
-**python -m torch.distributed.launch \
+**torchrun \
 	--nproc_per_node 4 run_speech_recognition_ctc_streaming.py \
 	--dataset_name="common_voice" \
 	--model_name_or_path="facebook/wav2vec2-xls-r-300m" \
@@ -404,7 +404,7 @@ If training on a different language, you should be sure to change the `language`
 #### Multi GPU Whisper Training
 The following example shows how to fine-tune the [Whisper small](https://huggingface.co/openai/whisper-small) checkpoint on the Hindi subset of [Common Voice 11](https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0) using 2 GPU devices in half-precision:
 ```bash
-python -m torch.distributed.launch \
+torchrun \
  	--nproc_per_node 2 run_speech_recognition_seq2seq.py \
 	--model_name_or_path="openai/whisper-small" \
 	--dataset_name="mozilla-foundation/common_voice_11_0" \
@@ -572,7 +572,7 @@ cross-entropy loss of **0.405** and word error rate of **0.0728**.
 The following command shows how to fine-tune [XLSR-Wav2Vec2](https://huggingface.co/transformers/main/model_doc/xlsr_wav2vec2.html) on [Common Voice](https://huggingface.co/datasets/common_voice) using 8 GPUs in half-precision.
 
 ```bash
-python -m torch.distributed.launch \
+torchrun \
  	--nproc_per_node 8 run_speech_recognition_seq2seq.py \
 	--dataset_name="librispeech_asr" \
 	--model_name_or_path="./" \
diff --git a/examples/pytorch/summarization/README.md b/examples/pytorch/summarization/README.md
index db7f8f4061a5c9..027119681de020 100644
--- a/examples/pytorch/summarization/README.md
+++ b/examples/pytorch/summarization/README.md
@@ -33,7 +33,7 @@ For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2s
 
 `run_summarization.py` is a lightweight example of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
 
-For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files
+For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files
 and you also will find examples of these below.
 
 ## With Trainer
diff --git a/examples/pytorch/summarization/run_summarization.py b/examples/pytorch/summarization/run_summarization.py
index d7f543c24868d1..46e92a70c0a380 100755
--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -432,7 +432,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/summarization/run_summarization_no_trainer.py b/examples/pytorch/summarization/run_summarization_no_trainer.py
index 42232787dfa46f..6bc5f8a42eed5b 100644
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -409,7 +409,7 @@ def main():
         extension = args.train_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/pytorch/test_accelerate_examples.py b/examples/pytorch/test_accelerate_examples.py
index 837e84e3320a1e..8749c8add77950 100644
--- a/examples/pytorch/test_accelerate_examples.py
+++ b/examples/pytorch/test_accelerate_examples.py
@@ -75,7 +75,7 @@ def setUpClass(cls):
     def tearDownClass(cls):
         shutil.rmtree(cls.tmpdir)
 
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_glue_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -100,7 +100,7 @@ def test_run_glue_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "glue_no_trainer")))
 
     @unittest.skip("Zach is working on this.")
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_clm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -128,7 +128,7 @@ def test_run_clm_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "clm_no_trainer")))
 
     @unittest.skip("Zach is working on this.")
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_mlm_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -148,7 +148,7 @@ def test_run_mlm_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "mlm_no_trainer")))
 
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_ner_no_trainer(self):
         # with so little data distributed training needs more epochs to get the score on par with 0/1 gpu
         epochs = 7 if backend_device_count(torch_device) > 1 else 2
@@ -176,7 +176,7 @@ def test_run_ner_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "ner_no_trainer")))
 
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_squad_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -204,7 +204,7 @@ def test_run_squad_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "epoch_0")))
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "qa_no_trainer")))
 
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_swag_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -227,7 +227,7 @@ def test_run_swag_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "swag_no_trainer")))
 
     @slow
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_summarization_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -255,7 +255,7 @@ def test_run_summarization_no_trainer(self):
         self.assertTrue(os.path.exists(os.path.join(tmp_dir, "summarization_no_trainer")))
 
     @slow
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_translation_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
@@ -306,7 +306,7 @@ def test_run_semantic_segmentation_no_trainer(self):
         result = get_results(tmp_dir)
         self.assertGreaterEqual(result["eval_overall_accuracy"], 0.10)
 
-    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline"})
+    @mock.patch.dict(os.environ, {"WANDB_MODE": "offline", "DVCLIVE_TEST": "true"})
     def test_run_image_classification_no_trainer(self):
         tmp_dir = self.get_auto_remove_tmp_dir()
         testargs = f"""
diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py
index 7e14c3deb69e4c..f7194405a1a865 100755
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@@ -396,7 +396,7 @@ def main():
             )
 
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if data_args.remove_splits is not None:
         for split in data_args.remove_splits.split(","):
diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py
index ff2644f8650780..343ee94843f439 100755
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -355,7 +355,7 @@ def main():
                 token=model_args.token,
             )
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Labels
     if data_args.task_name is not None:
@@ -372,7 +372,7 @@ def main():
             num_labels = 1
         else:
             # A useful fast method:
-            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
+            # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.unique
             label_list = raw_datasets["train"].unique("label")
             label_list.sort()  # Let's sort it for determinism
             num_labels = len(label_list)
diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py
index e4332966becd36..ad6147d386941d 100644
--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -293,7 +293,7 @@ def main():
         extension = (args.train_file if args.train_file is not None else args.validation_file).split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Labels
     if args.task_name is not None:
diff --git a/examples/pytorch/token-classification/run_ner.py b/examples/pytorch/token-classification/run_ner.py
index ec77e8ea6a8227..3901191d0690fe 100755
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -318,7 +318,7 @@ def main():
         extension = data_args.train_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if training_args.do_train:
         column_names = raw_datasets["train"].column_names
diff --git a/examples/pytorch/token-classification/run_ner_no_trainer.py b/examples/pytorch/token-classification/run_ner_no_trainer.py
index 1f83c65fcb54f1..42d1a70f37dab2 100755
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -348,7 +348,7 @@ def main():
         for split in raw_datasets.keys():
             raw_datasets[split] = raw_datasets[split].select(range(100))
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if raw_datasets["train"] is not None:
         column_names = raw_datasets["train"].column_names
diff --git a/examples/pytorch/translation/README.md b/examples/pytorch/translation/README.md
index 0593d577a01fdb..bd95e3a552150c 100644
--- a/examples/pytorch/translation/README.md
+++ b/examples/pytorch/translation/README.md
@@ -33,7 +33,7 @@ For the old `finetune_trainer.py` and related utils, see [`examples/legacy/seq2s
 
 `run_translation.py` is a lightweight examples of how to download and preprocess a dataset from the [🤗 Datasets](https://github.com/huggingface/datasets) library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
 
-For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files
+For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets#json-files
 and you also will find examples of these below.
 
 
diff --git a/examples/pytorch/translation/run_translation.py b/examples/pytorch/translation/run_translation.py
index 6edbe6a995c313..98780483a37e54 100755
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -374,8 +374,12 @@ def main():
         if data_args.test_file is not None:
             data_files["test"] = data_args.test_file
             extension = data_args.test_file.split(".")[-1]
+        if extension == "jsonl":
+            builder_name = "json"  # the "json" builder reads both .json and .jsonl files
+        else:
+            builder_name = extension  # e.g. "parquet"
         raw_datasets = load_dataset(
-            extension,
+            builder_name,
             data_files=data_files,
             cache_dir=model_args.cache_dir,
             token=model_args.token,
diff --git a/examples/pytorch/translation/run_translation_no_trainer.py b/examples/pytorch/translation/run_translation_no_trainer.py
index 35bc9a59da344a..1b7a1417a6829e 100644
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -389,7 +389,7 @@ def main():
         extension = args.train_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/research_projects/jax-projects/README.md b/examples/research_projects/jax-projects/README.md
index fc5f09695522fe..420a97f7682a9c 100644
--- a/examples/research_projects/jax-projects/README.md
+++ b/examples/research_projects/jax-projects/README.md
@@ -227,7 +227,7 @@ the forum and making use of the [🤗 hub](http://huggingface.co/) to have a ver
 control for your models and training logs.
 - When debugging, it is important that the debugging cycle is kept as short as possible to 
 be able to effectively debug. *E.g.* if there is a problem with your training script, 
-you should run it with just a couple of hundreds of examples and not the whole dataset script. This can be done by either making use of [datasets streaming](https://huggingface.co/docs/datasets/master/dataset_streaming.html?highlight=streaming) or by selecting just the first 
+you should run it with just a couple of hundreds of examples and not the whole dataset script. This can be done by either making use of [datasets streaming](https://huggingface.co/docs/datasets/master/dataset_streaming?highlight=streaming) or by selecting just the first 
 X number of data samples after loading:
 
 ```python
diff --git a/examples/research_projects/jax-projects/dataset-streaming/README.md b/examples/research_projects/jax-projects/dataset-streaming/README.md
index 416eee06af33d6..35fc02acd29d4d 100644
--- a/examples/research_projects/jax-projects/dataset-streaming/README.md
+++ b/examples/research_projects/jax-projects/dataset-streaming/README.md
@@ -23,7 +23,7 @@ JAX/Flax allows you to trace pure functions and compile them into efficient, fus
 Models written in JAX/Flax are **immutable** and updated in a purely functional
 way which enables simple and efficient model parallelism.
 
-All of the following examples make use of [dataset streaming](https://huggingface.co/docs/datasets/master/dataset_streaming.html), therefore allowing to train models on massive datasets\
+All of the following examples make use of [dataset streaming](https://huggingface.co/docs/datasets/master/dataset_streaming), therefore allowing to train models on massive datasets\
 without ever having to download the full dataset.
 
 ## Masked language modeling
diff --git a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
index efe87c1b059f31..4ff4bd559d8ced 100644
--- a/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
+++ b/examples/research_projects/jax-projects/model_parallel/run_clm_mp.py
@@ -304,7 +304,7 @@ def main():
             extension = "text"
         dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained config and tokenizer
     if model_args.config_name:
diff --git a/examples/research_projects/jax-projects/wav2vec2/README.md b/examples/research_projects/jax-projects/wav2vec2/README.md
index 3b1b74743085a2..200e7ad933eebf 100644
--- a/examples/research_projects/jax-projects/wav2vec2/README.md
+++ b/examples/research_projects/jax-projects/wav2vec2/README.md
@@ -10,7 +10,7 @@ way which enables simple and efficient model parallelism.
 
 `run_wav2vec2_pretrain_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets library or use your own files (jsonlines or csv), then pretrain the wav2vec2 architectures above on it.
 
-For custom datasets in `jsonlines` format please see: [the Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets.html#json-files) and you also will find examples of these below.
+For custom datasets in `jsonlines` format please see: [the Datasets documentation](https://huggingface.co/docs/datasets/loading_datasets#json-files) and you also will find examples of these below.
 
 Let's start by creating a model repository to save the trained model and logs.
 Here we call the model `"wav2vec2-base-robust"`, but you can change the model name as you like.
diff --git a/examples/research_projects/luke/run_luke_ner_no_trainer.py b/examples/research_projects/luke/run_luke_ner_no_trainer.py
index c1b573aee8140b..e03c665e4ec2cd 100644
--- a/examples/research_projects/luke/run_luke_ner_no_trainer.py
+++ b/examples/research_projects/luke/run_luke_ner_no_trainer.py
@@ -294,7 +294,7 @@ def main():
         for split in raw_datasets.keys():
             raw_datasets[split] = raw_datasets[split].select(range(100))
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if raw_datasets["train"] is not None:
         column_names = raw_datasets["train"].column_names
diff --git a/examples/research_projects/mlm_wwm/run_mlm_wwm.py b/examples/research_projects/mlm_wwm/run_mlm_wwm.py
index d22b2db7dcade8..3a7326d38219c0 100644
--- a/examples/research_projects/mlm_wwm/run_mlm_wwm.py
+++ b/examples/research_projects/mlm_wwm/run_mlm_wwm.py
@@ -278,7 +278,7 @@ def main():
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/research_projects/performer/run_mlm_performer.py b/examples/research_projects/performer/run_mlm_performer.py
index c1ff5aa388a590..7c1f418815bed8 100644
--- a/examples/research_projects/performer/run_mlm_performer.py
+++ b/examples/research_projects/performer/run_mlm_performer.py
@@ -524,7 +524,7 @@ def generate_batch_splits(samples_idx: np.ndarray, batch_size: int) -> np.ndarra
             extension = "text"
         datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
 
diff --git a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py
index 2055e6f4676b5b..f056e89206c6d3 100755
--- a/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py
+++ b/examples/research_projects/quantization-qdqbert/evaluate-hf-trt-qa.py
@@ -272,7 +272,7 @@ def model_infer(inputs, context, d_inputs, h_output0, h_output1, d_output0, d_ou
 else:
     raise ValueError("Evaluation requires a dataset name")
 # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-# https://huggingface.co/docs/datasets/loading_datasets.html.
+# https://huggingface.co/docs/datasets/loading_datasets.
 
 # Preprocessing the datasets.
 # Preprocessing is slighlty different for training and evaluation.
diff --git a/examples/research_projects/quantization-qdqbert/run_quant_qa.py b/examples/research_projects/quantization-qdqbert/run_quant_qa.py
index fac834ef70f3e0..3294b70da7e38e 100755
--- a/examples/research_projects/quantization-qdqbert/run_quant_qa.py
+++ b/examples/research_projects/quantization-qdqbert/run_quant_qa.py
@@ -308,7 +308,7 @@ def main():
             extension = data_args.test_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # set default quantization parameters before building model
     quant_trainer.set_default_quantizers(quant_trainer_args)
diff --git a/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py b/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py
index e0aa86a3a65ba9..20e0ea2d3cc2a2 100644
--- a/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py
+++ b/examples/research_projects/rag-end2end-retriever/use_own_knowledge_dataset.py
@@ -65,7 +65,7 @@ def main(
         "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
     )
 
-    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files
+    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets?highlight=csv#csv-files
 
     # Then split the documents into passages of 100 words
     dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)
diff --git a/examples/research_projects/rag/use_own_knowledge_dataset.py b/examples/research_projects/rag/use_own_knowledge_dataset.py
index 84d7c854975f11..d2ab6d07d5cc36 100644
--- a/examples/research_projects/rag/use_own_knowledge_dataset.py
+++ b/examples/research_projects/rag/use_own_knowledge_dataset.py
@@ -73,7 +73,7 @@ def main(
         "csv", data_files=[rag_example_args.csv_path], split="train", delimiter="\t", column_names=["title", "text"]
     )
 
-    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets.html?highlight=csv#csv-files
+    # More info about loading csv files in the documentation: https://huggingface.co/docs/datasets/loading_datasets?highlight=csv#csv-files
 
     # Then split the documents into passages of 100 words
     dataset = dataset.map(split_documents, batched=True, num_proc=processing_args.num_proc)
diff --git a/examples/research_projects/robust-speech-event/README.md b/examples/research_projects/robust-speech-event/README.md
index fd1a42c7d4bb58..4999950020b12d 100644
--- a/examples/research_projects/robust-speech-event/README.md
+++ b/examples/research_projects/robust-speech-event/README.md
@@ -112,7 +112,7 @@ Hugging Face Hub for additional audio data, for example by selecting the categor
 ["speech-processing"](https://huggingface.co/datasets?task_categories=task_categories:speech-processing&sort=downloads).
 All datasets that are available on the Hub can be downloaded via the 🤗 Datasets library in the same way Common Voice is downloaded.
 If one wants to combine multiple datasets for training, it might make sense to take a look at 
-the [`interleave_datasets`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=interleave#datasets.interleave_datasets) function.
+the [`interleave_datasets`](https://huggingface.co/docs/datasets/package_reference/main_classes?highlight=interleave#datasets.interleave_datasets) function.
 
 In addition, participants can also make use of their audio data. Here, please make sure that you **are allowed to use the audio data**. E.g., if audio data 
 is taken from media platforms, such as YouTube, it should be verified that the media platform and the owner of the data have given her/his approval to use the audio 
diff --git a/examples/research_projects/tapex/run_tabfact_with_tapex.py b/examples/research_projects/tapex/run_tabfact_with_tapex.py
index 2bef4a371ef68d..5dcec10a084c5f 100644
--- a/examples/research_projects/tapex/run_tabfact_with_tapex.py
+++ b/examples/research_projects/tapex/run_tabfact_with_tapex.py
@@ -277,7 +277,7 @@ def main():
             # Loading a dataset from local json files
             raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Labels
     label_list = raw_datasets["train"].features["label"].names
diff --git a/examples/research_projects/tapex/run_wikisql_with_tapex.py b/examples/research_projects/tapex/run_wikisql_with_tapex.py
index 821b283d9ff6c3..81e940a77c882c 100644
--- a/examples/research_projects/tapex/run_wikisql_with_tapex.py
+++ b/examples/research_projects/tapex/run_wikisql_with_tapex.py
@@ -317,7 +317,7 @@ def main():
         datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
index f874eebb3418b8..55350025cb3bb4 100644
--- a/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
+++ b/examples/research_projects/tapex/run_wikitablequestions_with_tapex.py
@@ -315,7 +315,7 @@ def main():
         datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/examples/tensorflow/contrastive-image-text/run_clip.py b/examples/tensorflow/contrastive-image-text/run_clip.py
index e56d66ce196c64..7f625616012b42 100644
--- a/examples/tensorflow/contrastive-image-text/run_clip.py
+++ b/examples/tensorflow/contrastive-image-text/run_clip.py
@@ -361,7 +361,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # 5. Load pretrained model, tokenizer, and image processor
     if model_args.tokenizer_name:
diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py
index 53c95b972a754c..3e4fe829682e5e 100644
--- a/examples/tensorflow/image-classification/run_image_classification.py
+++ b/examples/tensorflow/image-classification/run_image_classification.py
@@ -316,7 +316,7 @@ def main():
             task="image-classification",
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Prepare label mappings.
     # We'll include these in the model's config to get human readable labels in the Inference API.
diff --git a/examples/tensorflow/language-modeling/run_clm.py b/examples/tensorflow/language-modeling/run_clm.py
index d1555af174fecd..52b76f8fa0e41d 100755
--- a/examples/tensorflow/language-modeling/run_clm.py
+++ b/examples/tensorflow/language-modeling/run_clm.py
@@ -371,7 +371,7 @@ def main():
                 **dataset_args,
             )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
     # endregion
 
     # region Load pretrained model and tokenizer
diff --git a/examples/tensorflow/language-modeling/run_mlm.py b/examples/tensorflow/language-modeling/run_mlm.py
index 73c5d292cde5f2..5be9e0219b7182 100755
--- a/examples/tensorflow/language-modeling/run_mlm.py
+++ b/examples/tensorflow/language-modeling/run_mlm.py
@@ -353,7 +353,7 @@ def main():
         )
 
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
     # endregion
 
     # region Load pretrained model and tokenizer
diff --git a/examples/tensorflow/multiple-choice/run_swag.py b/examples/tensorflow/multiple-choice/run_swag.py
index db73e137b33d82..7b0cc7af9da7ec 100644
--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@@ -338,7 +338,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # When using your own dataset or a different dataset from swag, you will probably need to change this.
     ending_names = [f"ending{i}" for i in range(4)]
diff --git a/examples/tensorflow/question-answering/run_qa.py b/examples/tensorflow/question-answering/run_qa.py
index 4d458c1190ddfe..9f1fd997f1c4f8 100755
--- a/examples/tensorflow/question-answering/run_qa.py
+++ b/examples/tensorflow/question-answering/run_qa.py
@@ -352,7 +352,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
     # endregion
 
     # region Load pretrained model and tokenizer
diff --git a/examples/tensorflow/summarization/run_summarization.py b/examples/tensorflow/summarization/run_summarization.py
index c608933993400e..6781801f64cd8f 100644
--- a/examples/tensorflow/summarization/run_summarization.py
+++ b/examples/tensorflow/summarization/run_summarization.py
@@ -401,7 +401,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
     # endregion
 
     # region Load model config and tokenizer
diff --git a/examples/tensorflow/text-classification/run_glue.py b/examples/tensorflow/text-classification/run_glue.py
index 618296aa71a8db..642bf57d1ae1eb 100644
--- a/examples/tensorflow/text-classification/run_glue.py
+++ b/examples/tensorflow/text-classification/run_glue.py
@@ -271,7 +271,7 @@ def main():
         token=model_args.token,
     )
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     is_regression = data_args.task_name == "stsb"
     if not is_regression:
diff --git a/examples/tensorflow/text-classification/run_text_classification.py b/examples/tensorflow/text-classification/run_text_classification.py
index 169f109d2d9211..0c0d989c4cb318 100644
--- a/examples/tensorflow/text-classification/run_text_classification.py
+++ b/examples/tensorflow/text-classification/run_text_classification.py
@@ -290,7 +290,7 @@ def main():
         # Loading a dataset from local json files
         datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
     # endregion
 
     # region Label preprocessing
diff --git a/examples/tensorflow/token-classification/run_ner.py b/examples/tensorflow/token-classification/run_ner.py
index bca64df92479dc..31dff57862c7d5 100644
--- a/examples/tensorflow/token-classification/run_ner.py
+++ b/examples/tensorflow/token-classification/run_ner.py
@@ -269,7 +269,7 @@ def main():
             token=model_args.token,
         )
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     if raw_datasets["train"] is not None:
         column_names = raw_datasets["train"].column_names
diff --git a/setup.py b/setup.py
index deccac468a8a62..cc653bd82d177c 100644
--- a/setup.py
+++ b/setup.py
@@ -95,8 +95,8 @@
 # 1. all dependencies should be listed here with their version requirements if any
 # 2. once modified, run: `make deps_table_update` to update src/transformers/dependency_versions_table.py
 _deps = [
-    "Pillow<10.0.0",
-    "accelerate>=0.20.3",
+    "Pillow>=10.0.1,<=15.0",
+    "accelerate>=0.21.0",
     "av==9.2.0",  # Latest version of PyAV (10.0.0) has issues with audio stream.
     "beautifulsoup4",
     "codecarbon==1.2.0",
@@ -149,13 +149,13 @@
     "pytest-timeout",
     "pytest-xdist",
     "python>=3.8.0",
-    "ray[tune]",
+    "ray[tune]>=2.7.0",
     "regex!=2019.12.17",
     "requests",
     "rhoknp>=1.1.0,<1.3.1",
     "rjieba",
     "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
-    "ruff>=0.1.5,<=0.2",
+    "ruff==0.1.5",
     "sacrebleu>=1.4.12,<2.0.0",
     "sacremoses",
     "safetensors>=0.3.1",
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index cf89602b6597a3..614e5e8e77a4cb 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -186,16 +186,28 @@
         "WordpieceTokenizer",
     ],
     "models.bert_generation": ["BertGenerationConfig"],
-    "models.bert_japanese": ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"],
+    "models.bert_japanese": [
+        "BertJapaneseTokenizer",
+        "CharacterTokenizer",
+        "MecabTokenizer",
+    ],
     "models.bertweet": ["BertweetTokenizer"],
     "models.big_bird": ["BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP", "BigBirdConfig"],
     "models.bigbird_pegasus": [
         "BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "BigBirdPegasusConfig",
     ],
-    "models.biogpt": ["BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BioGptConfig", "BioGptTokenizer"],
+    "models.biogpt": [
+        "BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "BioGptConfig",
+        "BioGptTokenizer",
+    ],
     "models.bit": ["BIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BitConfig"],
-    "models.blenderbot": ["BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP", "BlenderbotConfig", "BlenderbotTokenizer"],
+    "models.blenderbot": [
+        "BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "BlenderbotConfig",
+        "BlenderbotTokenizer",
+    ],
     "models.blenderbot_small": [
         "BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "BlenderbotSmallConfig",
@@ -223,10 +235,18 @@
         "BridgeTowerTextConfig",
         "BridgeTowerVisionConfig",
     ],
-    "models.bros": ["BROS_PRETRAINED_CONFIG_ARCHIVE_MAP", "BrosConfig", "BrosProcessor"],
+    "models.bros": [
+        "BROS_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "BrosConfig",
+        "BrosProcessor",
+    ],
     "models.byt5": ["ByT5Tokenizer"],
     "models.camembert": ["CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CamembertConfig"],
-    "models.canine": ["CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP", "CanineConfig", "CanineTokenizer"],
+    "models.canine": [
+        "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "CanineConfig",
+        "CanineTokenizer",
+    ],
     "models.chinese_clip": [
         "CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "ChineseCLIPConfig",
@@ -266,14 +286,36 @@
         "ClvpTokenizer",
     ],
     "models.code_llama": [],
-    "models.codegen": ["CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP", "CodeGenConfig", "CodeGenTokenizer"],
-    "models.conditional_detr": ["CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConditionalDetrConfig"],
-    "models.convbert": ["CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvBertConfig", "ConvBertTokenizer"],
+    "models.codegen": [
+        "CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "CodeGenConfig",
+        "CodeGenTokenizer",
+    ],
+    "models.conditional_detr": [
+        "CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "ConditionalDetrConfig",
+    ],
+    "models.convbert": [
+        "CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "ConvBertConfig",
+        "ConvBertTokenizer",
+    ],
     "models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"],
-    "models.convnextv2": ["CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextV2Config"],
+    "models.convnextv2": [
+        "CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "ConvNextV2Config",
+    ],
     "models.cpm": [],
-    "models.cpmant": ["CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CpmAntConfig", "CpmAntTokenizer"],
-    "models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
+    "models.cpmant": [
+        "CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "CpmAntConfig",
+        "CpmAntTokenizer",
+    ],
+    "models.ctrl": [
+        "CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "CTRLConfig",
+        "CTRLTokenizer",
+    ],
     "models.cvt": ["CVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "CvtConfig"],
     "models.data2vec": [
         "DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -282,10 +324,23 @@
         "Data2VecTextConfig",
         "Data2VecVisionConfig",
     ],
-    "models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
-    "models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
-    "models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"],
-    "models.deformable_detr": ["DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeformableDetrConfig"],
+    "models.deberta": [
+        "DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "DebertaConfig",
+        "DebertaTokenizer",
+    ],
+    "models.deberta_v2": [
+        "DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "DebertaV2Config",
+    ],
+    "models.decision_transformer": [
+        "DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "DecisionTransformerConfig",
+    ],
+    "models.deformable_detr": [
+        "DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "DeformableDetrConfig",
+    ],
     "models.deit": ["DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DeiTConfig"],
     "models.deprecated": [],
     "models.deprecated.bort": [],
@@ -296,7 +351,10 @@
         "MCTCTProcessor",
     ],
     "models.deprecated.mmbt": ["MMBTConfig"],
-    "models.deprecated.open_llama": ["OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenLlamaConfig"],
+    "models.deprecated.open_llama": [
+        "OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "OpenLlamaConfig",
+    ],
     "models.deprecated.retribert": [
         "RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "RetriBertConfig",
@@ -307,15 +365,29 @@
         "TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TrajectoryTransformerConfig",
     ],
+    "models.deprecated.transfo_xl": [
+        "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TransfoXLConfig",
+        "TransfoXLCorpus",
+        "TransfoXLTokenizer",
+    ],
     "models.deprecated.van": ["VAN_PRETRAINED_CONFIG_ARCHIVE_MAP", "VanConfig"],
     "models.deta": ["DETA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetaConfig"],
     "models.detr": ["DETR_PRETRAINED_CONFIG_ARCHIVE_MAP", "DetrConfig"],
     "models.dialogpt": [],
     "models.dinat": ["DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DinatConfig"],
     "models.dinov2": ["DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Dinov2Config"],
-    "models.distilbert": ["DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DistilBertConfig", "DistilBertTokenizer"],
+    "models.distilbert": [
+        "DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "DistilBertConfig",
+        "DistilBertTokenizer",
+    ],
     "models.dit": [],
-    "models.donut": ["DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "DonutProcessor", "DonutSwinConfig"],
+    "models.donut": [
+        "DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "DonutProcessor",
+        "DonutSwinConfig",
+    ],
     "models.dpr": [
         "DPR_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "DPRConfig",
@@ -325,9 +397,19 @@
         "DPRReaderTokenizer",
     ],
     "models.dpt": ["DPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "DPTConfig"],
-    "models.efficientformer": ["EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "EfficientFormerConfig"],
-    "models.efficientnet": ["EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "EfficientNetConfig"],
-    "models.electra": ["ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "ElectraConfig", "ElectraTokenizer"],
+    "models.efficientformer": [
+        "EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "EfficientFormerConfig",
+    ],
+    "models.efficientnet": [
+        "EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "EfficientNetConfig",
+    ],
+    "models.electra": [
+        "ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "ElectraConfig",
+        "ElectraTokenizer",
+    ],
     "models.encodec": [
         "ENCODEC_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "EncodecConfig",
@@ -341,7 +423,11 @@
     "models.ernie_m": ["ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP", "ErnieMConfig"],
     "models.esm": ["ESM_PRETRAINED_CONFIG_ARCHIVE_MAP", "EsmConfig", "EsmTokenizer"],
     "models.falcon": ["FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP", "FalconConfig"],
-    "models.flaubert": ["FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FlaubertConfig", "FlaubertTokenizer"],
+    "models.flaubert": [
+        "FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "FlaubertConfig",
+        "FlaubertTokenizer",
+    ],
     "models.flava": [
         "FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "FlavaConfig",
@@ -352,16 +438,39 @@
     ],
     "models.fnet": ["FNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FNetConfig"],
     "models.focalnet": ["FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FocalNetConfig"],
-    "models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
-    "models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
+    "models.fsmt": [
+        "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "FSMTConfig",
+        "FSMTTokenizer",
+    ],
+    "models.funnel": [
+        "FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "FunnelConfig",
+        "FunnelTokenizer",
+    ],
     "models.fuyu": ["FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP", "FuyuConfig"],
-    "models.git": ["GIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GitConfig", "GitProcessor", "GitVisionConfig"],
+    "models.git": [
+        "GIT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "GitConfig",
+        "GitProcessor",
+        "GitVisionConfig",
+    ],
     "models.glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"],
-    "models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"],
-    "models.gpt_bigcode": ["GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTBigCodeConfig"],
+    "models.gpt2": [
+        "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "GPT2Config",
+        "GPT2Tokenizer",
+    ],
+    "models.gpt_bigcode": [
+        "GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "GPTBigCodeConfig",
+    ],
     "models.gpt_neo": ["GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoConfig"],
     "models.gpt_neox": ["GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXConfig"],
-    "models.gpt_neox_japanese": ["GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTNeoXJapaneseConfig"],
+    "models.gpt_neox_japanese": [
+        "GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "GPTNeoXJapaneseConfig",
+    ],
     "models.gpt_sw3": [],
     "models.gptj": ["GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPTJConfig"],
     "models.gptsan_japanese": [
@@ -369,7 +478,10 @@
         "GPTSanJapaneseConfig",
         "GPTSanJapaneseTokenizer",
     ],
-    "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"],
+    "models.graphormer": [
+        "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "GraphormerConfig",
+    ],
     "models.groupvit": [
         "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "GroupViTConfig",
@@ -404,7 +516,11 @@
         "Kosmos2Config",
         "Kosmos2Processor",
     ],
-    "models.layoutlm": ["LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "LayoutLMConfig", "LayoutLMTokenizer"],
+    "models.layoutlm": [
+        "LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "LayoutLMConfig",
+        "LayoutLMTokenizer",
+    ],
     "models.layoutlmv2": [
         "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "LayoutLMv2Config",
@@ -426,10 +542,26 @@
     "models.levit": ["LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LevitConfig"],
     "models.lilt": ["LILT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LiltConfig"],
     "models.llama": ["LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlamaConfig"],
-    "models.longformer": ["LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongformerConfig", "LongformerTokenizer"],
+    "models.llava": [
+        "LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "LlavaConfig",
+    ],
+    "models.longformer": [
+        "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "LongformerConfig",
+        "LongformerTokenizer",
+    ],
     "models.longt5": ["LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP", "LongT5Config"],
-    "models.luke": ["LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP", "LukeConfig", "LukeTokenizer"],
-    "models.lxmert": ["LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "LxmertConfig", "LxmertTokenizer"],
+    "models.luke": [
+        "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "LukeConfig",
+        "LukeTokenizer",
+    ],
+    "models.lxmert": [
+        "LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "LxmertConfig",
+        "LxmertTokenizer",
+    ],
     "models.m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"],
     "models.marian": ["MarianConfig"],
     "models.markuplm": [
@@ -443,21 +575,51 @@
         "MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "Mask2FormerConfig",
     ],
-    "models.maskformer": ["MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "MaskFormerConfig", "MaskFormerSwinConfig"],
+    "models.maskformer": [
+        "MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MaskFormerConfig",
+        "MaskFormerSwinConfig",
+    ],
     "models.mbart": ["MBartConfig"],
     "models.mbart50": [],
     "models.mega": ["MEGA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegaConfig"],
-    "models.megatron_bert": ["MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MegatronBertConfig"],
+    "models.megatron_bert": [
+        "MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MegatronBertConfig",
+    ],
     "models.megatron_gpt2": [],
-    "models.mgp_str": ["MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP", "MgpstrConfig", "MgpstrProcessor", "MgpstrTokenizer"],
+    "models.mgp_str": [
+        "MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MgpstrConfig",
+        "MgpstrProcessor",
+        "MgpstrTokenizer",
+    ],
     "models.mistral": ["MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MistralConfig"],
+    "models.mixtral": ["MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MixtralConfig"],
     "models.mluke": [],
-    "models.mobilebert": ["MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileBertConfig", "MobileBertTokenizer"],
-    "models.mobilenet_v1": ["MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileNetV1Config"],
-    "models.mobilenet_v2": ["MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileNetV2Config"],
+    "models.mobilebert": [
+        "MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MobileBertConfig",
+        "MobileBertTokenizer",
+    ],
+    "models.mobilenet_v1": [
+        "MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MobileNetV1Config",
+    ],
+    "models.mobilenet_v2": [
+        "MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MobileNetV2Config",
+    ],
     "models.mobilevit": ["MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileViTConfig"],
-    "models.mobilevitv2": ["MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "MobileViTV2Config"],
-    "models.mpnet": ["MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "MPNetConfig", "MPNetTokenizer"],
+    "models.mobilevitv2": [
+        "MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MobileViTV2Config",
+    ],
+    "models.mpnet": [
+        "MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "MPNetConfig",
+        "MPNetTokenizer",
+    ],
     "models.mpt": ["MPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "MptConfig"],
     "models.mra": ["MRA_PRETRAINED_CONFIG_ARCHIVE_MAP", "MraConfig"],
     "models.mt5": ["MT5Config"],
@@ -476,8 +638,16 @@
         "NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "NystromformerConfig",
     ],
-    "models.oneformer": ["ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "OneFormerConfig", "OneFormerProcessor"],
-    "models.openai": ["OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "OpenAIGPTConfig", "OpenAIGPTTokenizer"],
+    "models.oneformer": [
+        "ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "OneFormerConfig",
+        "OneFormerProcessor",
+    ],
+    "models.openai": [
+        "OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "OpenAIGPTConfig",
+        "OpenAIGPTTokenizer",
+    ],
     "models.opt": ["OPTConfig"],
     "models.owlv2": [
         "OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -493,9 +663,22 @@
         "OwlViTTextConfig",
         "OwlViTVisionConfig",
     ],
-    "models.pegasus": ["PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusConfig", "PegasusTokenizer"],
+    "models.patchtsmixer": [
+        "PATCHTSMIXER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "PatchTSMixerConfig",
+    ],
+    "models.patchtst": ["PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP", "PatchTSTConfig"],
+    "models.pegasus": [
+        "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "PegasusConfig",
+        "PegasusTokenizer",
+    ],
     "models.pegasus_x": ["PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP", "PegasusXConfig"],
-    "models.perceiver": ["PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PerceiverConfig", "PerceiverTokenizer"],
+    "models.perceiver": [
+        "PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "PerceiverConfig",
+        "PerceiverTokenizer",
+    ],
     "models.persimmon": ["PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP", "PersimmonConfig"],
     "models.phi": ["PHI_PRETRAINED_CONFIG_ARCHIVE_MAP", "PhiConfig"],
     "models.phobert": ["PhobertTokenizer"],
@@ -507,24 +690,50 @@
         "Pix2StructVisionConfig",
     ],
     "models.plbart": ["PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP", "PLBartConfig"],
-    "models.poolformer": ["POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "PoolFormerConfig"],
+    "models.poolformer": [
+        "POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "PoolFormerConfig",
+    ],
     "models.pop2piano": [
         "POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "Pop2PianoConfig",
     ],
-    "models.prophetnet": ["PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ProphetNetConfig", "ProphetNetTokenizer"],
+    "models.prophetnet": [
+        "PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "ProphetNetConfig",
+        "ProphetNetTokenizer",
+    ],
     "models.pvt": ["PVT_PRETRAINED_CONFIG_ARCHIVE_MAP", "PvtConfig"],
     "models.qdqbert": ["QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "QDQBertConfig"],
     "models.rag": ["RagConfig", "RagRetriever", "RagTokenizer"],
-    "models.realm": ["REALM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RealmConfig", "RealmTokenizer"],
+    "models.realm": [
+        "REALM_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "RealmConfig",
+        "RealmTokenizer",
+    ],
     "models.reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"],
     "models.regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"],
     "models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"],
     "models.resnet": ["RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "ResNetConfig"],
-    "models.roberta": ["ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaConfig", "RobertaTokenizer"],
-    "models.roberta_prelayernorm": ["ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP", "RobertaPreLayerNormConfig"],
-    "models.roc_bert": ["ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoCBertConfig", "RoCBertTokenizer"],
-    "models.roformer": ["ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "RoFormerConfig", "RoFormerTokenizer"],
+    "models.roberta": [
+        "ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "RobertaConfig",
+        "RobertaTokenizer",
+    ],
+    "models.roberta_prelayernorm": [
+        "ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "RobertaPreLayerNormConfig",
+    ],
+    "models.roc_bert": [
+        "ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "RoCBertConfig",
+        "RoCBertTokenizer",
+    ],
+    "models.roformer": [
+        "ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "RoFormerConfig",
+        "RoFormerTokenizer",
+    ],
     "models.rwkv": ["RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP", "RwkvConfig"],
     "models.sam": [
         "SAM_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -540,6 +749,10 @@
         "SeamlessM4TFeatureExtractor",
         "SeamlessM4TProcessor",
     ],
+    "models.seamless_m4t_v2": [
+        "SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "SeamlessM4Tv2Config",
+    ],
     "models.segformer": ["SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SegformerConfig"],
     "models.sew": ["SEW_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWConfig"],
     "models.sew_d": ["SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP", "SEWDConfig"],
@@ -564,28 +777,46 @@
         "SpeechT5HifiGanConfig",
         "SpeechT5Processor",
     ],
-    "models.splinter": ["SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SplinterConfig", "SplinterTokenizer"],
-    "models.squeezebert": ["SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "SqueezeBertConfig", "SqueezeBertTokenizer"],
-    "models.swiftformer": ["SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwiftFormerConfig"],
+    "models.splinter": [
+        "SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "SplinterConfig",
+        "SplinterTokenizer",
+    ],
+    "models.squeezebert": [
+        "SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "SqueezeBertConfig",
+        "SqueezeBertTokenizer",
+    ],
+    "models.swiftformer": [
+        "SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "SwiftFormerConfig",
+    ],
     "models.swin": ["SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwinConfig"],
     "models.swin2sr": ["SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swin2SRConfig"],
     "models.swinv2": ["SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP", "Swinv2Config"],
-    "models.switch_transformers": ["SWITCH_TRANSFORMERS_PRETRAINED_CONFIG_ARCHIVE_MAP", "SwitchTransformersConfig"],
+    "models.switch_transformers": [
+        "SWITCH_TRANSFORMERS_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "SwitchTransformersConfig",
+    ],
     "models.t5": ["T5_PRETRAINED_CONFIG_ARCHIVE_MAP", "T5Config"],
-    "models.table_transformer": ["TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TableTransformerConfig"],
-    "models.tapas": ["TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP", "TapasConfig", "TapasTokenizer"],
+    "models.table_transformer": [
+        "TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TableTransformerConfig",
+    ],
+    "models.tapas": [
+        "TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TapasConfig",
+        "TapasTokenizer",
+    ],
     "models.time_series_transformer": [
         "TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TimeSeriesTransformerConfig",
     ],
-    "models.timesformer": ["TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "TimesformerConfig"],
-    "models.timm_backbone": ["TimmBackboneConfig"],
-    "models.transfo_xl": [
-        "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "TransfoXLConfig",
-        "TransfoXLCorpus",
-        "TransfoXLTokenizer",
+    "models.timesformer": [
+        "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TimesformerConfig",
     ],
+    "models.timm_backbone": ["TimmBackboneConfig"],
     "models.trocr": [
         "TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "TrOCRConfig",
@@ -597,6 +828,11 @@
         "TvltFeatureExtractor",
         "TvltProcessor",
     ],
+    "models.tvp": [
+        "TVP_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TvpConfig",
+        "TvpProcessor",
+    ],
     "models.umt5": ["UMT5Config"],
     "models.unispeech": [
         "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -606,6 +842,11 @@
         "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP",
         "UniSpeechSatConfig",
     ],
+    "models.univnet": [
+        "UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "UnivNetConfig",
+        "UnivNetFeatureExtractor",
+    ],
     "models.upernet": ["UperNetConfig"],
     "models.videomae": ["VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "VideoMAEConfig"],
     "models.vilt": [
@@ -615,11 +856,24 @@
         "ViltImageProcessor",
         "ViltProcessor",
     ],
+    "models.vipllava": [
+        "VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "VipLlavaConfig",
+    ],
     "models.vision_encoder_decoder": ["VisionEncoderDecoderConfig"],
-    "models.vision_text_dual_encoder": ["VisionTextDualEncoderConfig", "VisionTextDualEncoderProcessor"],
-    "models.visual_bert": ["VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "VisualBertConfig"],
+    "models.vision_text_dual_encoder": [
+        "VisionTextDualEncoderConfig",
+        "VisionTextDualEncoderProcessor",
+    ],
+    "models.visual_bert": [
+        "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "VisualBertConfig",
+    ],
     "models.vit": ["VIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTConfig"],
-    "models.vit_hybrid": ["VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTHybridConfig"],
+    "models.vit_hybrid": [
+        "VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "ViTHybridConfig",
+    ],
     "models.vit_mae": ["VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMAEConfig"],
     "models.vit_msn": ["VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP", "ViTMSNConfig"],
     "models.vitdet": ["VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP", "VitDetConfig"],
@@ -667,9 +921,18 @@
     ],
     "models.xglm": ["XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XGLMConfig"],
     "models.xlm": ["XLM_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMConfig", "XLMTokenizer"],
-    "models.xlm_prophetnet": ["XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMProphetNetConfig"],
-    "models.xlm_roberta": ["XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMRobertaConfig"],
-    "models.xlm_roberta_xl": ["XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLMRobertaXLConfig"],
+    "models.xlm_prophetnet": [
+        "XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "XLMProphetNetConfig",
+    ],
+    "models.xlm_roberta": [
+        "XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "XLMRobertaConfig",
+    ],
+    "models.xlm_roberta_xl": [
+        "XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "XLMRobertaXLConfig",
+    ],
     "models.xlnet": ["XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "XLNetConfig"],
     "models.xmod": ["XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP", "XmodConfig"],
     "models.yolos": ["YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP", "YolosConfig"],
@@ -745,7 +1008,13 @@
         "TrainerControl",
         "TrainerState",
     ],
-    "trainer_utils": ["EvalPrediction", "IntervalStrategy", "SchedulerType", "enable_full_determinism", "set_seed"],
+    "trainer_utils": [
+        "EvalPrediction",
+        "IntervalStrategy",
+        "SchedulerType",
+        "enable_full_determinism",
+        "set_seed",
+    ],
     "training_args": ["TrainingArguments"],
     "training_args_seq2seq": ["Seq2SeqTrainingArguments"],
     "training_args_tf": ["TFTrainingArguments"],
@@ -870,7 +1139,11 @@
     _import_structure["models.deprecated.retribert"].append("RetriBertTokenizerFast")
     _import_structure["models.distilbert"].append("DistilBertTokenizerFast")
     _import_structure["models.dpr"].extend(
-        ["DPRContextEncoderTokenizerFast", "DPRQuestionEncoderTokenizerFast", "DPRReaderTokenizerFast"]
+        [
+            "DPRContextEncoderTokenizerFast",
+            "DPRQuestionEncoderTokenizerFast",
+            "DPRReaderTokenizerFast",
+        ]
     )
     _import_structure["models.electra"].append("ElectraTokenizerFast")
     _import_structure["models.fnet"].append("FNetTokenizerFast")
@@ -924,7 +1197,10 @@
         name for name in dir(dummy_sentencepiece_and_tokenizers_objects) if not name.startswith("_")
     ]
 else:
-    _import_structure["convert_slow_tokenizer"] = ["SLOW_TO_FAST_CONVERTERS", "convert_slow_tokenizer"]
+    _import_structure["convert_slow_tokenizer"] = [
+        "SLOW_TO_FAST_CONVERTERS",
+        "convert_slow_tokenizer",
+    ]
 
 # Tensorflow-text-specific objects
 try:
@@ -1010,6 +1286,7 @@
     _import_structure["models.segformer"].extend(["SegformerFeatureExtractor", "SegformerImageProcessor"])
     _import_structure["models.swin2sr"].append("Swin2SRImageProcessor")
     _import_structure["models.tvlt"].append("TvltImageProcessor")
+    _import_structure["models.tvp"].append("TvpImageProcessor")
     _import_structure["models.videomae"].extend(["VideoMAEFeatureExtractor", "VideoMAEImageProcessor"])
     _import_structure["models.vilt"].extend(["ViltFeatureExtractor", "ViltImageProcessor", "ViltProcessor"])
     _import_structure["models.vit"].extend(["ViTFeatureExtractor", "ViTImageProcessor"])
@@ -1031,6 +1308,7 @@
     _import_structure["activations"] = []
     _import_structure["benchmark.benchmark"] = ["PyTorchBenchmark"]
     _import_structure["benchmark.benchmark_args"] = ["PyTorchBenchmarkArguments"]
+    _import_structure["cache_utils"] = ["Cache", "DynamicCache", "SinkCache"]
     _import_structure["data.datasets"] = [
         "GlueDataset",
         "GlueDataTrainingArguments",
@@ -1167,6 +1445,8 @@
             "MODEL_FOR_TEXT_ENCODING_MAPPING",
             "MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING",
             "MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING",
+            "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING",
+            "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING",
             "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
             "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING",
             "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING",
@@ -1250,6 +1530,7 @@
     _import_structure["models.beit"].extend(
         [
             "BEIT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "BeitBackbone",
             "BeitForImageClassification",
             "BeitForMaskedImageModeling",
             "BeitForSemanticSegmentation",
@@ -1638,10 +1919,19 @@
     )
     _import_structure["models.deprecated.mmbt"].extend(["MMBTForClassification", "MMBTModel", "ModalEmbeddings"])
     _import_structure["models.deprecated.open_llama"].extend(
-        ["OpenLlamaForCausalLM", "OpenLlamaForSequenceClassification", "OpenLlamaModel", "OpenLlamaPreTrainedModel"]
+        [
+            "OpenLlamaForCausalLM",
+            "OpenLlamaForSequenceClassification",
+            "OpenLlamaModel",
+            "OpenLlamaPreTrainedModel",
+        ]
     )
     _import_structure["models.deprecated.retribert"].extend(
-        ["RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST", "RetriBertModel", "RetriBertPreTrainedModel"]
+        [
+            "RETRIBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "RetriBertModel",
+            "RetriBertPreTrainedModel",
+        ]
     )
     _import_structure["models.deprecated.trajectory_transformer"].extend(
         [
@@ -1650,6 +1940,17 @@
             "TrajectoryTransformerPreTrainedModel",
         ]
     )
+    _import_structure["models.deprecated.transfo_xl"].extend(
+        [
+            "TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "AdaptiveEmbedding",
+            "TransfoXLForSequenceClassification",
+            "TransfoXLLMHeadModel",
+            "TransfoXLModel",
+            "TransfoXLPreTrainedModel",
+            "load_tf_weights_in_transfo_xl",
+        ]
+    )
     _import_structure["models.deprecated.van"].extend(
         [
             "VAN_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2135,7 +2436,20 @@
         ]
     )
     _import_structure["models.llama"].extend(
-        ["LlamaForCausalLM", "LlamaForSequenceClassification", "LlamaModel", "LlamaPreTrainedModel"]
+        [
+            "LlamaForCausalLM",
+            "LlamaForSequenceClassification",
+            "LlamaModel",
+            "LlamaPreTrainedModel",
+        ]
+    )
+    _import_structure["models.llava"].extend(
+        [
+            "LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "LlavaForConditionalGeneration",
+            "LlavaPreTrainedModel",
+            "LlavaProcessor",
+        ]
     )
     _import_structure["models.longformer"].extend(
         [
@@ -2268,7 +2582,15 @@
         ]
     )
     _import_structure["models.mistral"].extend(
-        ["MistralForCausalLM", "MistralForSequenceClassification", "MistralModel", "MistralPreTrainedModel"]
+        [
+            "MistralForCausalLM",
+            "MistralForSequenceClassification",
+            "MistralModel",
+            "MistralPreTrainedModel",
+        ]
+    )
+    _import_structure["models.mixtral"].extend(
+        ["MixtralForCausalLM", "MixtralForSequenceClassification", "MixtralModel", "MixtralPreTrainedModel"]
     )
     _import_structure["models.mobilebert"].extend(
         [
@@ -2485,8 +2807,35 @@
             "OwlViTVisionModel",
         ]
     )
+    _import_structure["models.patchtsmixer"].extend(
+        [
+            "PATCHTSMIXER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "PatchTSMixerForPrediction",
+            "PatchTSMixerForPretraining",
+            "PatchTSMixerForRegression",
+            "PatchTSMixerForTimeSeriesClassification",
+            "PatchTSMixerModel",
+            "PatchTSMixerPreTrainedModel",
+        ]
+    )
+    _import_structure["models.patchtst"].extend(
+        [
+            "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "PatchTSTForClassification",
+            "PatchTSTForPrediction",
+            "PatchTSTForPretraining",
+            "PatchTSTForRegression",
+            "PatchTSTModel",
+            "PatchTSTPreTrainedModel",
+        ]
+    )
     _import_structure["models.pegasus"].extend(
-        ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel", "PegasusPreTrainedModel"]
+        [
+            "PegasusForCausalLM",
+            "PegasusForConditionalGeneration",
+            "PegasusModel",
+            "PegasusPreTrainedModel",
+        ]
     )
     _import_structure["models.pegasus_x"].extend(
         [
@@ -2512,7 +2861,12 @@
         ]
     )
     _import_structure["models.persimmon"].extend(
-        ["PersimmonForCausalLM", "PersimmonForSequenceClassification", "PersimmonModel", "PersimmonPreTrainedModel"]
+        [
+            "PersimmonForCausalLM",
+            "PersimmonForSequenceClassification",
+            "PersimmonModel",
+            "PersimmonPreTrainedModel",
+        ]
     )
     _import_structure["models.phi"].extend(
         [
@@ -2594,7 +2948,12 @@
         ]
     )
     _import_structure["models.rag"].extend(
-        ["RagModel", "RagPreTrainedModel", "RagSequenceForGeneration", "RagTokenForGeneration"]
+        [
+            "RagModel",
+            "RagPreTrainedModel",
+            "RagSequenceForGeneration",
+            "RagTokenForGeneration",
+        ]
     )
     _import_structure["models.realm"].extend(
         [
@@ -2741,6 +3100,17 @@
             "SeamlessM4TTextToUnitModel",
         ]
     )
+    _import_structure["models.seamless_m4t_v2"].extend(
+        [
+            "SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "SeamlessM4Tv2ForSpeechToSpeech",
+            "SeamlessM4Tv2ForSpeechToText",
+            "SeamlessM4Tv2ForTextToSpeech",
+            "SeamlessM4Tv2ForTextToText",
+            "SeamlessM4Tv2Model",
+            "SeamlessM4Tv2PreTrainedModel",
+        ]
+    )
     _import_structure["models.segformer"].extend(
         [
             "SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2908,20 +3278,13 @@
         ]
     )
     _import_structure["models.timm_backbone"].extend(["TimmBackbone"])
-    _import_structure["models.transfo_xl"].extend(
+    _import_structure["models.trocr"].extend(
         [
-            "TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "AdaptiveEmbedding",
-            "TransfoXLForSequenceClassification",
-            "TransfoXLLMHeadModel",
-            "TransfoXLModel",
-            "TransfoXLPreTrainedModel",
-            "load_tf_weights_in_transfo_xl",
+            "TROCR_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TrOCRForCausalLM",
+            "TrOCRPreTrainedModel",
         ]
     )
-    _import_structure["models.trocr"].extend(
-        ["TROCR_PRETRAINED_MODEL_ARCHIVE_LIST", "TrOCRForCausalLM", "TrOCRPreTrainedModel"]
-    )
     _import_structure["models.tvlt"].extend(
         [
             "TVLT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -2931,6 +3294,14 @@
             "TvltPreTrainedModel",
         ]
     )
+    _import_structure["models.tvp"].extend(
+        [
+            "TVP_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TvpForVideoGrounding",
+            "TvpModel",
+            "TvpPreTrainedModel",
+        ]
+    )
     _import_structure["models.umt5"].extend(
         [
             "UMT5EncoderModel",
@@ -2963,6 +3334,12 @@
             "UniSpeechSatPreTrainedModel",
         ]
     )
+    _import_structure["models.univnet"].extend(
+        [
+            "UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "UnivNetModel",
+        ]
+    )
     _import_structure["models.upernet"].extend(
         [
             "UperNetForSemanticSegmentation",
@@ -2991,6 +3368,13 @@
             "ViltPreTrainedModel",
         ]
     )
+    _import_structure["models.vipllava"].extend(
+        [
+            "VIPLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "VipLlavaForConditionalGeneration",
+            "VipLlavaPreTrainedModel",
+        ]
+    )
     _import_structure["models.vision_encoder_decoder"].extend(["VisionEncoderDecoderModel"])
     _import_structure["models.vision_text_dual_encoder"].extend(["VisionTextDualEncoderModel"])
     _import_structure["models.visual_bert"].extend(
@@ -3243,7 +3627,11 @@
         "get_polynomial_decay_schedule_with_warmup",
         "get_scheduler",
     ]
-    _import_structure["pytorch_utils"] = ["Conv1D", "apply_chunking_to_forward", "prune_layer"]
+    _import_structure["pytorch_utils"] = [
+        "Conv1D",
+        "apply_chunking_to_forward",
+        "prune_layer",
+    ]
     _import_structure["sagemaker"] = []
     _import_structure["time_series_utils"] = []
     _import_structure["trainer"] = ["Trainer"]
@@ -3356,7 +3744,12 @@
         ]
     )
     _import_structure["models.bart"].extend(
-        ["TFBartForConditionalGeneration", "TFBartForSequenceClassification", "TFBartModel", "TFBartPretrainedModel"]
+        [
+            "TFBartForConditionalGeneration",
+            "TFBartForSequenceClassification",
+            "TFBartModel",
+            "TFBartPretrainedModel",
+        ]
     )
     _import_structure["models.bert"].extend(
         [
@@ -3376,10 +3769,18 @@
         ]
     )
     _import_structure["models.blenderbot"].extend(
-        ["TFBlenderbotForConditionalGeneration", "TFBlenderbotModel", "TFBlenderbotPreTrainedModel"]
+        [
+            "TFBlenderbotForConditionalGeneration",
+            "TFBlenderbotModel",
+            "TFBlenderbotPreTrainedModel",
+        ]
     )
     _import_structure["models.blenderbot_small"].extend(
-        ["TFBlenderbotSmallForConditionalGeneration", "TFBlenderbotSmallModel", "TFBlenderbotSmallPreTrainedModel"]
+        [
+            "TFBlenderbotSmallForConditionalGeneration",
+            "TFBlenderbotSmallModel",
+            "TFBlenderbotSmallPreTrainedModel",
+        ]
     )
     _import_structure["models.blip"].extend(
         [
@@ -3500,6 +3901,17 @@
             "TFDeiTPreTrainedModel",
         ]
     )
+    _import_structure["models.deprecated.transfo_xl"].extend(
+        [
+            "TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFAdaptiveEmbedding",
+            "TFTransfoXLForSequenceClassification",
+            "TFTransfoXLLMHeadModel",
+            "TFTransfoXLMainLayer",
+            "TFTransfoXLModel",
+            "TFTransfoXLPreTrainedModel",
+        ]
+    )
     _import_structure["models.distilbert"].extend(
         [
             "TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3729,7 +4141,11 @@
         ]
     )
     _import_structure["models.pegasus"].extend(
-        ["TFPegasusForConditionalGeneration", "TFPegasusModel", "TFPegasusPreTrainedModel"]
+        [
+            "TFPegasusForConditionalGeneration",
+            "TFPegasusModel",
+            "TFPegasusPreTrainedModel",
+        ]
     )
     _import_structure["models.rag"].extend(
         [
@@ -3864,17 +4280,6 @@
             "TFTapasPreTrainedModel",
         ]
     )
-    _import_structure["models.transfo_xl"].extend(
-        [
-            "TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
-            "TFAdaptiveEmbedding",
-            "TFTransfoXLForSequenceClassification",
-            "TFTransfoXLLMHeadModel",
-            "TFTransfoXLMainLayer",
-            "TFTransfoXLModel",
-            "TFTransfoXLPreTrainedModel",
-        ]
-    )
     _import_structure["models.vision_encoder_decoder"].extend(["TFVisionEncoderDecoderModel"])
     _import_structure["models.vision_text_dual_encoder"].extend(["TFVisionTextDualEncoderModel"])
     _import_structure["models.vit"].extend(
@@ -3955,7 +4360,12 @@
             "TFXLNetPreTrainedModel",
         ]
     )
-    _import_structure["optimization_tf"] = ["AdamWeightDecay", "GradientAccumulator", "WarmUp", "create_optimizer"]
+    _import_structure["optimization_tf"] = [
+        "AdamWeightDecay",
+        "GradientAccumulator",
+        "WarmUp",
+        "create_optimizer",
+    ]
     _import_structure["tf_utils"] = []
     _import_structure["trainer_tf"] = ["TFTrainer"]
 
@@ -3970,7 +4380,9 @@
     ):
         raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
-    from .utils import dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects
+    from .utils import (
+        dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects,
+    )
 
     _import_structure["utils.dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects"] = [
         name
@@ -4109,7 +4521,11 @@
         ]
     )
     _import_structure["models.blenderbot"].extend(
-        ["FlaxBlenderbotForConditionalGeneration", "FlaxBlenderbotModel", "FlaxBlenderbotPreTrainedModel"]
+        [
+            "FlaxBlenderbotForConditionalGeneration",
+            "FlaxBlenderbotModel",
+            "FlaxBlenderbotPreTrainedModel",
+        ]
     )
     _import_structure["models.blenderbot_small"].extend(
         [
@@ -4166,8 +4582,13 @@
         ["FlaxGPTNeoForCausalLM", "FlaxGPTNeoModel", "FlaxGPTNeoPreTrainedModel"]
     )
     _import_structure["models.gptj"].extend(["FlaxGPTJForCausalLM", "FlaxGPTJModel", "FlaxGPTJPreTrainedModel"])
+    _import_structure["models.llama"].extend(["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"])
     _import_structure["models.longt5"].extend(
-        ["FlaxLongT5ForConditionalGeneration", "FlaxLongT5Model", "FlaxLongT5PreTrainedModel"]
+        [
+            "FlaxLongT5ForConditionalGeneration",
+            "FlaxLongT5Model",
+            "FlaxLongT5PreTrainedModel",
+        ]
     )
     _import_structure["models.marian"].extend(
         [
@@ -4201,10 +4622,18 @@
         ]
     )
     _import_structure["models.regnet"].extend(
-        ["FlaxRegNetForImageClassification", "FlaxRegNetModel", "FlaxRegNetPreTrainedModel"]
+        [
+            "FlaxRegNetForImageClassification",
+            "FlaxRegNetModel",
+            "FlaxRegNetPreTrainedModel",
+        ]
     )
     _import_structure["models.resnet"].extend(
-        ["FlaxResNetForImageClassification", "FlaxResNetModel", "FlaxResNetPreTrainedModel"]
+        [
+            "FlaxResNetForImageClassification",
+            "FlaxResNetModel",
+            "FlaxResNetPreTrainedModel",
+        ]
     )
     _import_structure["models.roberta"].extend(
         [
@@ -4243,13 +4672,23 @@
     )
     _import_structure["models.speech_encoder_decoder"].append("FlaxSpeechEncoderDecoderModel")
     _import_structure["models.t5"].extend(
-        ["FlaxT5EncoderModel", "FlaxT5ForConditionalGeneration", "FlaxT5Model", "FlaxT5PreTrainedModel"]
+        [
+            "FlaxT5EncoderModel",
+            "FlaxT5ForConditionalGeneration",
+            "FlaxT5Model",
+            "FlaxT5PreTrainedModel",
+        ]
     )
     _import_structure["models.vision_encoder_decoder"].append("FlaxVisionEncoderDecoderModel")
     _import_structure["models.vision_text_dual_encoder"].extend(["FlaxVisionTextDualEncoderModel"])
     _import_structure["models.vit"].extend(["FlaxViTForImageClassification", "FlaxViTModel", "FlaxViTPreTrainedModel"])
     _import_structure["models.wav2vec2"].extend(
-        ["FlaxWav2Vec2ForCTC", "FlaxWav2Vec2ForPreTraining", "FlaxWav2Vec2Model", "FlaxWav2Vec2PreTrainedModel"]
+        [
+            "FlaxWav2Vec2ForCTC",
+            "FlaxWav2Vec2ForPreTraining",
+            "FlaxWav2Vec2Model",
+            "FlaxWav2Vec2PreTrainedModel",
+        ]
     )
     _import_structure["models.whisper"].extend(
         [
@@ -4410,13 +4849,28 @@
         WordpieceTokenizer,
     )
     from .models.bert_generation import BertGenerationConfig
-    from .models.bert_japanese import BertJapaneseTokenizer, CharacterTokenizer, MecabTokenizer
+    from .models.bert_japanese import (
+        BertJapaneseTokenizer,
+        CharacterTokenizer,
+        MecabTokenizer,
+    )
     from .models.bertweet import BertweetTokenizer
     from .models.big_bird import BIG_BIRD_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdConfig
-    from .models.bigbird_pegasus import BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, BigBirdPegasusConfig
-    from .models.biogpt import BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, BioGptConfig, BioGptTokenizer
+    from .models.bigbird_pegasus import (
+        BIGBIRD_PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        BigBirdPegasusConfig,
+    )
+    from .models.biogpt import (
+        BIOGPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        BioGptConfig,
+        BioGptTokenizer,
+    )
     from .models.bit import BIT_PRETRAINED_CONFIG_ARCHIVE_MAP, BitConfig
-    from .models.blenderbot import BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP, BlenderbotConfig, BlenderbotTokenizer
+    from .models.blenderbot import (
+        BLENDERBOT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        BlenderbotConfig,
+        BlenderbotTokenizer,
+    )
     from .models.blenderbot_small import (
         BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
         BlenderbotSmallConfig,
@@ -4444,10 +4898,21 @@
         BridgeTowerTextConfig,
         BridgeTowerVisionConfig,
     )
-    from .models.bros import BROS_PRETRAINED_CONFIG_ARCHIVE_MAP, BrosConfig, BrosProcessor
+    from .models.bros import (
+        BROS_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        BrosConfig,
+        BrosProcessor,
+    )
     from .models.byt5 import ByT5Tokenizer
-    from .models.camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CamembertConfig
-    from .models.canine import CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP, CanineConfig, CanineTokenizer
+    from .models.camembert import (
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CamembertConfig,
+    )
+    from .models.canine import (
+        CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CanineConfig,
+        CanineTokenizer,
+    )
     from .models.chinese_clip import (
         CHINESE_CLIP_PRETRAINED_CONFIG_ARCHIVE_MAP,
         ChineseCLIPConfig,
@@ -4486,13 +4951,35 @@
         ClvpProcessor,
         ClvpTokenizer,
     )
-    from .models.codegen import CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP, CodeGenConfig, CodeGenTokenizer
-    from .models.conditional_detr import CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, ConditionalDetrConfig
-    from .models.convbert import CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvBertConfig, ConvBertTokenizer
+    from .models.codegen import (
+        CODEGEN_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CodeGenConfig,
+        CodeGenTokenizer,
+    )
+    from .models.conditional_detr import (
+        CONDITIONAL_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ConditionalDetrConfig,
+    )
+    from .models.convbert import (
+        CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ConvBertConfig,
+        ConvBertTokenizer,
+    )
     from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig
-    from .models.convnextv2 import CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextV2Config
-    from .models.cpmant import CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP, CpmAntConfig, CpmAntTokenizer
-    from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
+    from .models.convnextv2 import (
+        CONVNEXTV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ConvNextV2Config,
+    )
+    from .models.cpmant import (
+        CPMANT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CpmAntConfig,
+        CpmAntTokenizer,
+    )
+    from .models.ctrl import (
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRLConfig,
+        CTRLTokenizer,
+    )
     from .models.cvt import CVT_PRETRAINED_CONFIG_ARCHIVE_MAP, CvtConfig
     from .models.data2vec import (
         DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4501,13 +4988,23 @@
         Data2VecTextConfig,
         Data2VecVisionConfig,
     )
-    from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
-    from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
+    from .models.deberta import (
+        DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DebertaConfig,
+        DebertaTokenizer,
+    )
+    from .models.deberta_v2 import (
+        DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DebertaV2Config,
+    )
     from .models.decision_transformer import (
         DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         DecisionTransformerConfig,
     )
-    from .models.deformable_detr import DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DeformableDetrConfig
+    from .models.deformable_detr import (
+        DEFORMABLE_DETR_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DeformableDetrConfig,
+    )
     from .models.deit import DEIT_PRETRAINED_CONFIG_ARCHIVE_MAP, DeiTConfig
     from .models.deprecated.mctct import (
         MCTCT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4516,7 +5013,10 @@
         MCTCTProcessor,
     )
     from .models.deprecated.mmbt import MMBTConfig
-    from .models.deprecated.open_llama import OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenLlamaConfig
+    from .models.deprecated.open_llama import (
+        OPEN_LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OpenLlamaConfig,
+    )
     from .models.deprecated.retribert import (
         RETRIBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         RetriBertConfig,
@@ -4527,13 +5027,27 @@
         TRAJECTORY_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TrajectoryTransformerConfig,
     )
+    from .models.deprecated.transfo_xl import (
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TransfoXLConfig,
+        TransfoXLCorpus,
+        TransfoXLTokenizer,
+    )
     from .models.deprecated.van import VAN_PRETRAINED_CONFIG_ARCHIVE_MAP, VanConfig
     from .models.deta import DETA_PRETRAINED_CONFIG_ARCHIVE_MAP, DetaConfig
     from .models.detr import DETR_PRETRAINED_CONFIG_ARCHIVE_MAP, DetrConfig
     from .models.dinat import DINAT_PRETRAINED_CONFIG_ARCHIVE_MAP, DinatConfig
     from .models.dinov2 import DINOV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Dinov2Config
-    from .models.distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, DistilBertTokenizer
-    from .models.donut import DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, DonutProcessor, DonutSwinConfig
+    from .models.distilbert import (
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DistilBertConfig,
+        DistilBertTokenizer,
+    )
+    from .models.donut import (
+        DONUT_SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DonutProcessor,
+        DonutSwinConfig,
+    )
     from .models.dpr import (
         DPR_PRETRAINED_CONFIG_ARCHIVE_MAP,
         DPRConfig,
@@ -4543,9 +5057,19 @@
         DPRReaderTokenizer,
     )
     from .models.dpt import DPT_PRETRAINED_CONFIG_ARCHIVE_MAP, DPTConfig
-    from .models.efficientformer import EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientFormerConfig
-    from .models.efficientnet import EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP, EfficientNetConfig
-    from .models.electra import ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP, ElectraConfig, ElectraTokenizer
+    from .models.efficientformer import (
+        EFFICIENTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        EfficientFormerConfig,
+    )
+    from .models.efficientnet import (
+        EFFICIENTNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        EfficientNetConfig,
+    )
+    from .models.electra import (
+        ELECTRA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ElectraConfig,
+        ElectraTokenizer,
+    )
     from .models.encodec import (
         ENCODEC_PRETRAINED_CONFIG_ARCHIVE_MAP,
         EncodecConfig,
@@ -4556,7 +5080,11 @@
     from .models.ernie_m import ERNIE_M_PRETRAINED_CONFIG_ARCHIVE_MAP, ErnieMConfig
     from .models.esm import ESM_PRETRAINED_CONFIG_ARCHIVE_MAP, EsmConfig, EsmTokenizer
     from .models.falcon import FALCON_PRETRAINED_CONFIG_ARCHIVE_MAP, FalconConfig
-    from .models.flaubert import FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, FlaubertConfig, FlaubertTokenizer
+    from .models.flaubert import (
+        FLAUBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        FlaubertConfig,
+        FlaubertTokenizer,
+    )
     from .models.flava import (
         FLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
         FlavaConfig,
@@ -4567,23 +5095,49 @@
     )
     from .models.fnet import FNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FNetConfig
     from .models.focalnet import FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FocalNetConfig
-    from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
-    from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
+    from .models.fsmt import (
+        FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        FSMTConfig,
+        FSMTTokenizer,
+    )
+    from .models.funnel import (
+        FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        FunnelConfig,
+        FunnelTokenizer,
+    )
     from .models.fuyu import FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP, FuyuConfig
-    from .models.git import GIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GitConfig, GitProcessor, GitVisionConfig
+    from .models.git import (
+        GIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GitConfig,
+        GitProcessor,
+        GitVisionConfig,
+    )
     from .models.glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig
-    from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer
-    from .models.gpt_bigcode import GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTBigCodeConfig
+    from .models.gpt2 import (
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2Config,
+        GPT2Tokenizer,
+    )
+    from .models.gpt_bigcode import (
+        GPT_BIGCODE_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPTBigCodeConfig,
+    )
     from .models.gpt_neo import GPT_NEO_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoConfig
     from .models.gpt_neox import GPT_NEOX_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXConfig
-    from .models.gpt_neox_japanese import GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTNeoXJapaneseConfig
+    from .models.gpt_neox_japanese import (
+        GPT_NEOX_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPTNeoXJapaneseConfig,
+    )
     from .models.gptj import GPTJ_PRETRAINED_CONFIG_ARCHIVE_MAP, GPTJConfig
     from .models.gptsan_japanese import (
         GPTSAN_JAPANESE_PRETRAINED_CONFIG_ARCHIVE_MAP,
         GPTSanJapaneseConfig,
         GPTSanJapaneseTokenizer,
     )
-    from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig
+    from .models.graphormer import (
+        GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GraphormerConfig,
+    )
     from .models.groupvit import (
         GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         GroupViTConfig,
@@ -4618,7 +5172,11 @@
         Kosmos2Config,
         Kosmos2Processor,
     )
-    from .models.layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP, LayoutLMConfig, LayoutLMTokenizer
+    from .models.layoutlm import (
+        LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LayoutLMConfig,
+        LayoutLMTokenizer,
+    )
     from .models.layoutlmv2 import (
         LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
         LayoutLMv2Config,
@@ -4640,10 +5198,26 @@
     from .models.levit import LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, LevitConfig
     from .models.lilt import LILT_PRETRAINED_CONFIG_ARCHIVE_MAP, LiltConfig
     from .models.llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig
-    from .models.longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig, LongformerTokenizer
+    from .models.llava import (
+        LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LlavaConfig,
+    )
+    from .models.longformer import (
+        LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LongformerConfig,
+        LongformerTokenizer,
+    )
     from .models.longt5 import LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP, LongT5Config
-    from .models.luke import LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP, LukeConfig, LukeTokenizer
-    from .models.lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig, LxmertTokenizer
+    from .models.luke import (
+        LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LukeConfig,
+        LukeTokenizer,
+    )
+    from .models.lxmert import (
+        LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        LxmertConfig,
+        LxmertTokenizer,
+    )
     from .models.m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
     from .models.marian import MarianConfig
     from .models.markuplm import (
@@ -4653,19 +5227,55 @@
         MarkupLMProcessor,
         MarkupLMTokenizer,
     )
-    from .models.mask2former import MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, Mask2FormerConfig
-    from .models.maskformer import MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, MaskFormerConfig, MaskFormerSwinConfig
+    from .models.mask2former import (
+        MASK2FORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        Mask2FormerConfig,
+    )
+    from .models.maskformer import (
+        MASKFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MaskFormerConfig,
+        MaskFormerSwinConfig,
+    )
     from .models.mbart import MBartConfig
     from .models.mega import MEGA_PRETRAINED_CONFIG_ARCHIVE_MAP, MegaConfig
-    from .models.megatron_bert import MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MegatronBertConfig
-    from .models.mgp_str import MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP, MgpstrConfig, MgpstrProcessor, MgpstrTokenizer
+    from .models.megatron_bert import (
+        MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MegatronBertConfig,
+    )
+    from .models.mgp_str import (
+        MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MgpstrConfig,
+        MgpstrProcessor,
+        MgpstrTokenizer,
+    )
     from .models.mistral import MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MistralConfig
-    from .models.mobilebert import MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileBertConfig, MobileBertTokenizer
-    from .models.mobilenet_v1 import MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileNetV1Config
-    from .models.mobilenet_v2 import MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileNetV2Config
-    from .models.mobilevit import MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTConfig
-    from .models.mobilevitv2 import MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP, MobileViTV2Config
-    from .models.mpnet import MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP, MPNetConfig, MPNetTokenizer
+    from .models.mixtral import MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MixtralConfig
+    from .models.mobilebert import (
+        MOBILEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MobileBertConfig,
+        MobileBertTokenizer,
+    )
+    from .models.mobilenet_v1 import (
+        MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MobileNetV1Config,
+    )
+    from .models.mobilenet_v2 import (
+        MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MobileNetV2Config,
+    )
+    from .models.mobilevit import (
+        MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MobileViTConfig,
+    )
+    from .models.mobilevitv2 import (
+        MOBILEVITV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MobileViTV2Config,
+    )
+    from .models.mpnet import (
+        MPNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        MPNetConfig,
+        MPNetTokenizer,
+    )
     from .models.mpt import MPT_PRETRAINED_CONFIG_ARCHIVE_MAP, MptConfig
     from .models.mra import MRA_PRETRAINED_CONFIG_ARCHIVE_MAP, MraConfig
     from .models.mt5 import MT5Config
@@ -4679,9 +5289,20 @@
     from .models.nezha import NEZHA_PRETRAINED_CONFIG_ARCHIVE_MAP, NezhaConfig
     from .models.nllb_moe import NLLB_MOE_PRETRAINED_CONFIG_ARCHIVE_MAP, NllbMoeConfig
     from .models.nougat import NougatProcessor
-    from .models.nystromformer import NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, NystromformerConfig
-    from .models.oneformer import ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, OneFormerConfig, OneFormerProcessor
-    from .models.openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig, OpenAIGPTTokenizer
+    from .models.nystromformer import (
+        NYSTROMFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        NystromformerConfig,
+    )
+    from .models.oneformer import (
+        ONEFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OneFormerConfig,
+        OneFormerProcessor,
+    )
+    from .models.openai import (
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OpenAIGPTConfig,
+        OpenAIGPTTokenizer,
+    )
     from .models.opt import OPTConfig
     from .models.owlv2 import (
         OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4697,10 +5318,29 @@
         OwlViTTextConfig,
         OwlViTVisionConfig,
     )
-    from .models.pegasus import PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusConfig, PegasusTokenizer
-    from .models.pegasus_x import PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP, PegasusXConfig
-    from .models.perceiver import PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP, PerceiverConfig, PerceiverTokenizer
-    from .models.persimmon import PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP, PersimmonConfig
+    from .models.patchtsmixer import (
+        PATCHTSMIXER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PatchTSMixerConfig,
+    )
+    from .models.patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig
+    from .models.pegasus import (
+        PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PegasusConfig,
+        PegasusTokenizer,
+    )
+    from .models.pegasus_x import (
+        PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PegasusXConfig,
+    )
+    from .models.perceiver import (
+        PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PerceiverConfig,
+        PerceiverTokenizer,
+    )
+    from .models.persimmon import (
+        PERSIMMON_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PersimmonConfig,
+    )
     from .models.phi import PHI_PRETRAINED_CONFIG_ARCHIVE_MAP, PhiConfig
     from .models.phobert import PhobertTokenizer
     from .models.pix2struct import (
@@ -4711,27 +5351,50 @@
         Pix2StructVisionConfig,
     )
     from .models.plbart import PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP, PLBartConfig
-    from .models.poolformer import POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, PoolFormerConfig
+    from .models.poolformer import (
+        POOLFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PoolFormerConfig,
+    )
     from .models.pop2piano import (
         POP2PIANO_PRETRAINED_CONFIG_ARCHIVE_MAP,
         Pop2PianoConfig,
     )
-    from .models.prophetnet import PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ProphetNetConfig, ProphetNetTokenizer
+    from .models.prophetnet import (
+        PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ProphetNetConfig,
+        ProphetNetTokenizer,
+    )
     from .models.pvt import PVT_PRETRAINED_CONFIG_ARCHIVE_MAP, PvtConfig
     from .models.qdqbert import QDQBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, QDQBertConfig
     from .models.rag import RagConfig, RagRetriever, RagTokenizer
-    from .models.realm import REALM_PRETRAINED_CONFIG_ARCHIVE_MAP, RealmConfig, RealmTokenizer
+    from .models.realm import (
+        REALM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        RealmConfig,
+        RealmTokenizer,
+    )
     from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig
     from .models.regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig
     from .models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig
     from .models.resnet import RESNET_PRETRAINED_CONFIG_ARCHIVE_MAP, ResNetConfig
-    from .models.roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, RobertaTokenizer
+    from .models.roberta import (
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        RobertaConfig,
+        RobertaTokenizer,
+    )
     from .models.roberta_prelayernorm import (
         ROBERTA_PRELAYERNORM_PRETRAINED_CONFIG_ARCHIVE_MAP,
         RobertaPreLayerNormConfig,
     )
-    from .models.roc_bert import ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RoCBertConfig, RoCBertTokenizer
-    from .models.roformer import ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, RoFormerConfig, RoFormerTokenizer
+    from .models.roc_bert import (
+        ROC_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        RoCBertConfig,
+        RoCBertTokenizer,
+    )
+    from .models.roformer import (
+        ROFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        RoFormerConfig,
+        RoFormerTokenizer,
+    )
     from .models.rwkv import RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP, RwkvConfig
     from .models.sam import (
         SAM_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -4747,7 +5410,14 @@
         SeamlessM4TFeatureExtractor,
         SeamlessM4TProcessor,
     )
-    from .models.segformer import SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SegformerConfig
+    from .models.seamless_m4t_v2 import (
+        SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        SeamlessM4Tv2Config,
+    )
+    from .models.segformer import (
+        SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        SegformerConfig,
+    )
     from .models.sew import SEW_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWConfig
     from .models.sew_d import SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP, SEWDConfig
     from .models.speech_encoder_decoder import SpeechEncoderDecoderConfig
@@ -4771,33 +5441,76 @@
         SpeechT5HifiGanConfig,
         SpeechT5Processor,
     )
-    from .models.splinter import SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP, SplinterConfig, SplinterTokenizer
-    from .models.squeezebert import SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, SqueezeBertConfig, SqueezeBertTokenizer
-    from .models.swiftformer import SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, SwiftFormerConfig
+    from .models.splinter import (
+        SPLINTER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        SplinterConfig,
+        SplinterTokenizer,
+    )
+    from .models.squeezebert import (
+        SQUEEZEBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        SqueezeBertConfig,
+        SqueezeBertTokenizer,
+    )
+    from .models.swiftformer import (
+        SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        SwiftFormerConfig,
+    )
     from .models.swin import SWIN_PRETRAINED_CONFIG_ARCHIVE_MAP, SwinConfig
     from .models.swin2sr import SWIN2SR_PRETRAINED_CONFIG_ARCHIVE_MAP, Swin2SRConfig
     from .models.swinv2 import SWINV2_PRETRAINED_CONFIG_ARCHIVE_MAP, Swinv2Config
-    from .models.switch_transformers import SWITCH_TRANSFORMERS_PRETRAINED_CONFIG_ARCHIVE_MAP, SwitchTransformersConfig
+    from .models.switch_transformers import (
+        SWITCH_TRANSFORMERS_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        SwitchTransformersConfig,
+    )
     from .models.t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config
-    from .models.table_transformer import TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TableTransformerConfig
-    from .models.tapas import TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP, TapasConfig, TapasTokenizer
+    from .models.table_transformer import (
+        TABLE_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TableTransformerConfig,
+    )
+    from .models.tapas import (
+        TAPAS_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TapasConfig,
+        TapasTokenizer,
+    )
     from .models.time_series_transformer import (
         TIME_SERIES_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
         TimeSeriesTransformerConfig,
     )
-    from .models.timesformer import TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, TimesformerConfig
+    from .models.timesformer import (
+        TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TimesformerConfig,
+    )
     from .models.timm_backbone import TimmBackboneConfig
-    from .models.transfo_xl import (
-        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        TransfoXLConfig,
-        TransfoXLCorpus,
-        TransfoXLTokenizer,
+    from .models.trocr import (
+        TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TrOCRConfig,
+        TrOCRProcessor,
+    )
+    from .models.tvlt import (
+        TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TvltConfig,
+        TvltFeatureExtractor,
+        TvltProcessor,
+    )
+    from .models.tvp import (
+        TVP_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TvpConfig,
+        TvpProcessor,
     )
-    from .models.trocr import TROCR_PRETRAINED_CONFIG_ARCHIVE_MAP, TrOCRConfig, TrOCRProcessor
-    from .models.tvlt import TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP, TvltConfig, TvltFeatureExtractor, TvltProcessor
     from .models.umt5 import UMT5Config
-    from .models.unispeech import UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechConfig
-    from .models.unispeech_sat import UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP, UniSpeechSatConfig
+    from .models.unispeech import (
+        UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        UniSpeechConfig,
+    )
+    from .models.unispeech_sat import (
+        UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        UniSpeechSatConfig,
+    )
+    from .models.univnet import (
+        UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        UnivNetConfig,
+        UnivNetFeatureExtractor,
+    )
     from .models.upernet import UperNetConfig
     from .models.videomae import VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP, VideoMAEConfig
     from .models.vilt import (
@@ -4807,11 +5520,24 @@
         ViltImageProcessor,
         ViltProcessor,
     )
+    from .models.vipllava import (
+        VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        VipLlavaConfig,
+    )
     from .models.vision_encoder_decoder import VisionEncoderDecoderConfig
-    from .models.vision_text_dual_encoder import VisionTextDualEncoderConfig, VisionTextDualEncoderProcessor
-    from .models.visual_bert import VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, VisualBertConfig
+    from .models.vision_text_dual_encoder import (
+        VisionTextDualEncoderConfig,
+        VisionTextDualEncoderProcessor,
+    )
+    from .models.visual_bert import (
+        VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        VisualBertConfig,
+    )
     from .models.vit import VIT_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTConfig
-    from .models.vit_hybrid import VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTHybridConfig
+    from .models.vit_hybrid import (
+        VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ViTHybridConfig,
+    )
     from .models.vit_mae import VIT_MAE_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMAEConfig
     from .models.vit_msn import VIT_MSN_PRETRAINED_CONFIG_ARCHIVE_MAP, ViTMSNConfig
     from .models.vitdet import VITDET_PRETRAINED_CONFIG_ARCHIVE_MAP, VitDetConfig
@@ -4830,7 +5556,10 @@
         Wav2Vec2Processor,
         Wav2Vec2Tokenizer,
     )
-    from .models.wav2vec2_conformer import WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, Wav2Vec2ConformerConfig
+    from .models.wav2vec2_conformer import (
+        WAV2VEC2_CONFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        Wav2Vec2ConformerConfig,
+    )
     from .models.wav2vec2_phoneme import Wav2Vec2PhonemeCTCTokenizer
     from .models.wav2vec2_with_lm import Wav2Vec2ProcessorWithLM
     from .models.wavlm import WAVLM_PRETRAINED_CONFIG_ARCHIVE_MAP, WavLMConfig
@@ -4850,9 +5579,18 @@
     )
     from .models.xglm import XGLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XGLMConfig
     from .models.xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMConfig, XLMTokenizer
-    from .models.xlm_prophetnet import XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMProphetNetConfig
-    from .models.xlm_roberta import XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaConfig
-    from .models.xlm_roberta_xl import XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, XLMRobertaXLConfig
+    from .models.xlm_prophetnet import (
+        XLM_PROPHETNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLMProphetNetConfig,
+    )
+    from .models.xlm_roberta import (
+        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLMRobertaConfig,
+    )
+    from .models.xlm_roberta_xl import (
+        XLM_ROBERTA_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLMRobertaXLConfig,
+    )
     from .models.xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, XLNetConfig
     from .models.xmod import XMOD_PRETRAINED_CONFIG_ARCHIVE_MAP, XmodConfig
     from .models.yolos import YOLOS_PRETRAINED_CONFIG_ARCHIVE_MAP, YolosConfig
@@ -4934,7 +5672,13 @@
         TrainerControl,
         TrainerState,
     )
-    from .trainer_utils import EvalPrediction, IntervalStrategy, SchedulerType, enable_full_determinism, set_seed
+    from .trainer_utils import (
+        EvalPrediction,
+        IntervalStrategy,
+        SchedulerType,
+        enable_full_determinism,
+        set_seed,
+    )
     from .training_args import TrainingArguments
     from .training_args_seq2seq import Seq2SeqTrainingArguments
     from .training_args_tf import TFTrainingArguments
@@ -5050,7 +5794,11 @@
         from .models.deberta_v2 import DebertaV2TokenizerFast
         from .models.deprecated.retribert import RetriBertTokenizerFast
         from .models.distilbert import DistilBertTokenizerFast
-        from .models.dpr import DPRContextEncoderTokenizerFast, DPRQuestionEncoderTokenizerFast, DPRReaderTokenizerFast
+        from .models.dpr import (
+            DPRContextEncoderTokenizerFast,
+            DPRQuestionEncoderTokenizerFast,
+            DPRReaderTokenizerFast,
+        )
         from .models.electra import ElectraTokenizerFast
         from .models.fnet import FNetTokenizerFast
         from .models.funnel import FunnelTokenizerFast
@@ -5098,7 +5846,10 @@
     except OptionalDependencyNotAvailable:
         from .utils.dummies_sentencepiece_and_tokenizers_objects import *
     else:
-        from .convert_slow_tokenizer import SLOW_TO_FAST_CONVERTERS, convert_slow_tokenizer
+        from .convert_slow_tokenizer import (
+            SLOW_TO_FAST_CONVERTERS,
+            convert_slow_tokenizer,
+        )
 
     try:
         if not is_tensorflow_text_available():
@@ -5128,11 +5879,20 @@
         from .models.bit import BitImageProcessor
         from .models.blip import BlipImageProcessor
         from .models.bridgetower import BridgeTowerImageProcessor
-        from .models.chinese_clip import ChineseCLIPFeatureExtractor, ChineseCLIPImageProcessor
+        from .models.chinese_clip import (
+            ChineseCLIPFeatureExtractor,
+            ChineseCLIPImageProcessor,
+        )
         from .models.clip import CLIPFeatureExtractor, CLIPImageProcessor
-        from .models.conditional_detr import ConditionalDetrFeatureExtractor, ConditionalDetrImageProcessor
+        from .models.conditional_detr import (
+            ConditionalDetrFeatureExtractor,
+            ConditionalDetrImageProcessor,
+        )
         from .models.convnext import ConvNextFeatureExtractor, ConvNextImageProcessor
-        from .models.deformable_detr import DeformableDetrFeatureExtractor, DeformableDetrImageProcessor
+        from .models.deformable_detr import (
+            DeformableDetrFeatureExtractor,
+            DeformableDetrImageProcessor,
+        )
         from .models.deit import DeiTFeatureExtractor, DeiTImageProcessor
         from .models.deta import DetaImageProcessor
         from .models.detr import DetrFeatureExtractor, DetrImageProcessor
@@ -5140,18 +5900,37 @@
         from .models.dpt import DPTFeatureExtractor, DPTImageProcessor
         from .models.efficientformer import EfficientFormerImageProcessor
         from .models.efficientnet import EfficientNetImageProcessor
-        from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor
+        from .models.flava import (
+            FlavaFeatureExtractor,
+            FlavaImageProcessor,
+            FlavaProcessor,
+        )
         from .models.fuyu import FuyuImageProcessor, FuyuProcessor
         from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
         from .models.idefics import IdeficsImageProcessor
         from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
-        from .models.layoutlmv2 import LayoutLMv2FeatureExtractor, LayoutLMv2ImageProcessor
-        from .models.layoutlmv3 import LayoutLMv3FeatureExtractor, LayoutLMv3ImageProcessor
+        from .models.layoutlmv2 import (
+            LayoutLMv2FeatureExtractor,
+            LayoutLMv2ImageProcessor,
+        )
+        from .models.layoutlmv3 import (
+            LayoutLMv3FeatureExtractor,
+            LayoutLMv3ImageProcessor,
+        )
         from .models.levit import LevitFeatureExtractor, LevitImageProcessor
         from .models.mask2former import Mask2FormerImageProcessor
-        from .models.maskformer import MaskFormerFeatureExtractor, MaskFormerImageProcessor
-        from .models.mobilenet_v1 import MobileNetV1FeatureExtractor, MobileNetV1ImageProcessor
-        from .models.mobilenet_v2 import MobileNetV2FeatureExtractor, MobileNetV2ImageProcessor
+        from .models.maskformer import (
+            MaskFormerFeatureExtractor,
+            MaskFormerImageProcessor,
+        )
+        from .models.mobilenet_v1 import (
+            MobileNetV1FeatureExtractor,
+            MobileNetV1ImageProcessor,
+        )
+        from .models.mobilenet_v2 import (
+            MobileNetV2FeatureExtractor,
+            MobileNetV2ImageProcessor,
+        )
         from .models.mobilevit import MobileViTFeatureExtractor, MobileViTImageProcessor
         from .models.nougat import NougatImageProcessor
         from .models.oneformer import OneFormerImageProcessor
@@ -5159,12 +5938,16 @@
         from .models.owlvit import OwlViTFeatureExtractor, OwlViTImageProcessor
         from .models.perceiver import PerceiverFeatureExtractor, PerceiverImageProcessor
         from .models.pix2struct import Pix2StructImageProcessor
-        from .models.poolformer import PoolFormerFeatureExtractor, PoolFormerImageProcessor
+        from .models.poolformer import (
+            PoolFormerFeatureExtractor,
+            PoolFormerImageProcessor,
+        )
         from .models.pvt import PvtImageProcessor
         from .models.sam import SamImageProcessor
         from .models.segformer import SegformerFeatureExtractor, SegformerImageProcessor
         from .models.swin2sr import Swin2SRImageProcessor
         from .models.tvlt import TvltImageProcessor
+        from .models.tvp import TvpImageProcessor
         from .models.videomae import VideoMAEFeatureExtractor, VideoMAEImageProcessor
         from .models.vilt import ViltFeatureExtractor, ViltImageProcessor, ViltProcessor
         from .models.vit import ViTFeatureExtractor, ViTImageProcessor
@@ -5183,6 +5966,7 @@
         # Benchmarks
         from .benchmark.benchmark import PyTorchBenchmark
         from .benchmark.benchmark_args import PyTorchBenchmarkArguments
+        from .cache_utils import Cache, DynamicCache, SinkCache
         from .data.datasets import (
             GlueDataset,
             GlueDataTrainingArguments,
@@ -5303,6 +6087,8 @@
             MODEL_FOR_TEXT_ENCODING_MAPPING,
             MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING,
             MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING,
+            MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING,
+            MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING,
             MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
             MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
             MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
@@ -5378,6 +6164,7 @@
         )
         from .models.beit import (
             BEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            BeitBackbone,
             BeitForImageClassification,
             BeitForMaskedImageModeling,
             BeitForSemanticSegmentation,
@@ -5693,7 +6480,11 @@
             MCTCTModel,
             MCTCTPreTrainedModel,
         )
-        from .models.deprecated.mmbt import MMBTForClassification, MMBTModel, ModalEmbeddings
+        from .models.deprecated.mmbt import (
+            MMBTForClassification,
+            MMBTModel,
+            ModalEmbeddings,
+        )
         from .models.deprecated.open_llama import (
             OpenLlamaForCausalLM,
             OpenLlamaForSequenceClassification,
@@ -5710,6 +6501,15 @@
             TrajectoryTransformerModel,
             TrajectoryTransformerPreTrainedModel,
         )
+        from .models.deprecated.transfo_xl import (
+            TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
+            AdaptiveEmbedding,
+            TransfoXLForSequenceClassification,
+            TransfoXLLMHeadModel,
+            TransfoXLModel,
+            TransfoXLPreTrainedModel,
+            load_tf_weights_in_transfo_xl,
+        )
         from .models.deprecated.van import (
             VAN_PRETRAINED_MODEL_ARCHIVE_LIST,
             VanForImageClassification,
@@ -5753,7 +6553,11 @@
             DistilBertModel,
             DistilBertPreTrainedModel,
         )
-        from .models.donut import DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST, DonutSwinModel, DonutSwinPreTrainedModel
+        from .models.donut import (
+            DONUT_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
+            DonutSwinModel,
+            DonutSwinPreTrainedModel,
+        )
         from .models.dpr import (
             DPR_CONTEXT_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
             DPR_QUESTION_ENCODER_PRETRAINED_MODEL_ARCHIVE_LIST,
@@ -5889,7 +6693,11 @@
             FocalNetModel,
             FocalNetPreTrainedModel,
         )
-        from .models.fsmt import FSMTForConditionalGeneration, FSMTModel, PretrainedFSMTModel
+        from .models.fsmt import (
+            FSMTForConditionalGeneration,
+            FSMTModel,
+            PretrainedFSMTModel,
+        )
         from .models.funnel import (
             FUNNEL_PRETRAINED_MODEL_ARCHIVE_LIST,
             FunnelBaseModel,
@@ -6100,6 +6908,12 @@
             LiltPreTrainedModel,
         )
         from .models.llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel
+        from .models.llava import (
+            LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
+            LlavaForConditionalGeneration,
+            LlavaPreTrainedModel,
+            LlavaProcessor,
+        )
         from .models.longformer import (
             LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
             LongformerForMaskedLM,
@@ -6212,6 +7026,12 @@
             MistralModel,
             MistralPreTrainedModel,
         )
+        from .models.mixtral import (
+            MixtralForCausalLM,
+            MixtralForSequenceClassification,
+            MixtralModel,
+            MixtralPreTrainedModel,
+        )
         from .models.mobilebert import (
             MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
             MobileBertForMaskedLM,
@@ -6387,6 +7207,24 @@
             OwlViTTextModel,
             OwlViTVisionModel,
         )
+        from .models.patchtsmixer import (
+            PATCHTSMIXER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            PatchTSMixerForPrediction,
+            PatchTSMixerForPretraining,
+            PatchTSMixerForRegression,
+            PatchTSMixerForTimeSeriesClassification,
+            PatchTSMixerModel,
+            PatchTSMixerPreTrainedModel,
+        )
+        from .models.patchtst import (
+            PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST,
+            PatchTSTForClassification,
+            PatchTSTForPrediction,
+            PatchTSTForPretraining,
+            PatchTSTForRegression,
+            PatchTSTModel,
+            PatchTSTPreTrainedModel,
+        )
         from .models.pegasus import (
             PegasusForCausalLM,
             PegasusForConditionalGeneration,
@@ -6481,7 +7319,12 @@
             QDQBertPreTrainedModel,
             load_tf_weights_in_qdqbert,
         )
-        from .models.rag import RagModel, RagPreTrainedModel, RagSequenceForGeneration, RagTokenForGeneration
+        from .models.rag import (
+            RagModel,
+            RagPreTrainedModel,
+            RagSequenceForGeneration,
+            RagTokenForGeneration,
+        )
         from .models.realm import (
             REALM_PRETRAINED_MODEL_ARCHIVE_LIST,
             RealmEmbedder,
@@ -6605,6 +7448,15 @@
             SeamlessM4TTextToUnitForConditionalGeneration,
             SeamlessM4TTextToUnitModel,
         )
+        from .models.seamless_m4t_v2 import (
+            SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
+            SeamlessM4Tv2ForSpeechToSpeech,
+            SeamlessM4Tv2ForSpeechToText,
+            SeamlessM4Tv2ForTextToSpeech,
+            SeamlessM4Tv2ForTextToText,
+            SeamlessM4Tv2Model,
+            SeamlessM4Tv2PreTrainedModel,
+        )
         from .models.segformer import (
             SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
             SegformerDecodeHead,
@@ -6635,7 +7487,10 @@
             Speech2TextModel,
             Speech2TextPreTrainedModel,
         )
-        from .models.speech_to_text_2 import Speech2Text2ForCausalLM, Speech2Text2PreTrainedModel
+        from .models.speech_to_text_2 import (
+            Speech2Text2ForCausalLM,
+            Speech2Text2PreTrainedModel,
+        )
         from .models.speecht5 import (
             SPEECHT5_PRETRAINED_MODEL_ARCHIVE_LIST,
             SpeechT5ForSpeechToSpeech,
@@ -6738,16 +7593,11 @@
             TimesformerPreTrainedModel,
         )
         from .models.timm_backbone import TimmBackbone
-        from .models.transfo_xl import (
-            TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
-            AdaptiveEmbedding,
-            TransfoXLForSequenceClassification,
-            TransfoXLLMHeadModel,
-            TransfoXLModel,
-            TransfoXLPreTrainedModel,
-            load_tf_weights_in_transfo_xl,
+        from .models.trocr import (
+            TROCR_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TrOCRForCausalLM,
+            TrOCRPreTrainedModel,
         )
-        from .models.trocr import TROCR_PRETRAINED_MODEL_ARCHIVE_LIST, TrOCRForCausalLM, TrOCRPreTrainedModel
         from .models.tvlt import (
             TVLT_PRETRAINED_MODEL_ARCHIVE_LIST,
             TvltForAudioVisualClassification,
@@ -6755,6 +7605,12 @@
             TvltModel,
             TvltPreTrainedModel,
         )
+        from .models.tvp import (
+            TVP_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TvpForVideoGrounding,
+            TvpModel,
+            TvpPreTrainedModel,
+        )
         from .models.umt5 import (
             UMT5EncoderModel,
             UMT5ForConditionalGeneration,
@@ -6781,7 +7637,11 @@
             UniSpeechSatModel,
             UniSpeechSatPreTrainedModel,
         )
-        from .models.upernet import UperNetForSemanticSegmentation, UperNetPreTrainedModel
+        from .models.univnet import UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST, UnivNetModel
+        from .models.upernet import (
+            UperNetForSemanticSegmentation,
+            UperNetPreTrainedModel,
+        )
         from .models.videomae import (
             VIDEOMAE_PRETRAINED_MODEL_ARCHIVE_LIST,
             VideoMAEForPreTraining,
@@ -6800,6 +7660,11 @@
             ViltModel,
             ViltPreTrainedModel,
         )
+        from .models.vipllava import (
+            VIPLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
+            VipLlavaForConditionalGeneration,
+            VipLlavaPreTrainedModel,
+        )
         from .models.vision_encoder_decoder import VisionEncoderDecoderModel
         from .models.vision_text_dual_encoder import VisionTextDualEncoderModel
         from .models.visual_bert import (
@@ -6906,7 +7771,12 @@
             XCLIPTextModel,
             XCLIPVisionModel,
         )
-        from .models.xglm import XGLM_PRETRAINED_MODEL_ARCHIVE_LIST, XGLMForCausalLM, XGLMModel, XGLMPreTrainedModel
+        from .models.xglm import (
+            XGLM_PRETRAINED_MODEL_ARCHIVE_LIST,
+            XGLMForCausalLM,
+            XGLMModel,
+            XGLMPreTrainedModel,
+        )
         from .models.xlm import (
             XLM_PRETRAINED_MODEL_ARCHIVE_LIST,
             XLMForMultipleChoice,
@@ -7043,7 +7913,12 @@
             tf_top_k_top_p_filtering,
         )
         from .keras_callbacks import KerasMetricCallback, PushToHubCallback
-        from .modeling_tf_utils import TFPreTrainedModel, TFSequenceSummary, TFSharedEmbeddings, shape_list
+        from .modeling_tf_utils import (
+            TFPreTrainedModel,
+            TFSequenceSummary,
+            TFSharedEmbeddings,
+            shape_list,
+        )
 
         # TensorFlow model imports
         from .models.albert import (
@@ -7174,7 +8049,11 @@
             TFConvBertModel,
             TFConvBertPreTrainedModel,
         )
-        from .models.convnext import TFConvNextForImageClassification, TFConvNextModel, TFConvNextPreTrainedModel
+        from .models.convnext import (
+            TFConvNextForImageClassification,
+            TFConvNextModel,
+            TFConvNextPreTrainedModel,
+        )
         from .models.convnextv2 import (
             TFConvNextV2ForImageClassification,
             TFConvNextV2Model,
@@ -7226,6 +8105,15 @@
             TFDeiTModel,
             TFDeiTPreTrainedModel,
         )
+        from .models.deprecated.transfo_xl import (
+            TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFAdaptiveEmbedding,
+            TFTransfoXLForSequenceClassification,
+            TFTransfoXLLMHeadModel,
+            TFTransfoXLMainLayer,
+            TFTransfoXLModel,
+            TFTransfoXLPreTrainedModel,
+        )
         from .models.distilbert import (
             TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFDistilBertForMaskedLM,
@@ -7344,7 +8232,11 @@
             TFLayoutLMv3Model,
             TFLayoutLMv3PreTrainedModel,
         )
-        from .models.led import TFLEDForConditionalGeneration, TFLEDModel, TFLEDPreTrainedModel
+        from .models.led import (
+            TFLEDForConditionalGeneration,
+            TFLEDModel,
+            TFLEDPreTrainedModel,
+        )
         from .models.longformer import (
             TF_LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFLongformerForMaskedLM,
@@ -7364,8 +8256,16 @@
             TFLxmertPreTrainedModel,
             TFLxmertVisualFeatureEncoder,
         )
-        from .models.marian import TFMarianModel, TFMarianMTModel, TFMarianPreTrainedModel
-        from .models.mbart import TFMBartForConditionalGeneration, TFMBartModel, TFMBartPreTrainedModel
+        from .models.marian import (
+            TFMarianModel,
+            TFMarianMTModel,
+            TFMarianPreTrainedModel,
+        )
+        from .models.mbart import (
+            TFMBartForConditionalGeneration,
+            TFMBartModel,
+            TFMBartPreTrainedModel,
+        )
         from .models.mobilebert import (
             TF_MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFMobileBertForMaskedLM,
@@ -7397,7 +8297,11 @@
             TFMPNetModel,
             TFMPNetPreTrainedModel,
         )
-        from .models.mt5 import TFMT5EncoderModel, TFMT5ForConditionalGeneration, TFMT5Model
+        from .models.mt5 import (
+            TFMT5EncoderModel,
+            TFMT5ForConditionalGeneration,
+            TFMT5Model,
+        )
         from .models.openai import (
             TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFOpenAIGPTDoubleHeadsModel,
@@ -7408,8 +8312,17 @@
             TFOpenAIGPTPreTrainedModel,
         )
         from .models.opt import TFOPTForCausalLM, TFOPTModel, TFOPTPreTrainedModel
-        from .models.pegasus import TFPegasusForConditionalGeneration, TFPegasusModel, TFPegasusPreTrainedModel
-        from .models.rag import TFRagModel, TFRagPreTrainedModel, TFRagSequenceForGeneration, TFRagTokenForGeneration
+        from .models.pegasus import (
+            TFPegasusForConditionalGeneration,
+            TFPegasusModel,
+            TFPegasusPreTrainedModel,
+        )
+        from .models.rag import (
+            TFRagModel,
+            TFRagPreTrainedModel,
+            TFRagSequenceForGeneration,
+            TFRagTokenForGeneration,
+        )
         from .models.regnet import (
             TF_REGNET_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFRegNetForImageClassification,
@@ -7511,19 +8424,18 @@
             TFTapasModel,
             TFTapasPreTrainedModel,
         )
-        from .models.transfo_xl import (
-            TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
-            TFAdaptiveEmbedding,
-            TFTransfoXLForSequenceClassification,
-            TFTransfoXLLMHeadModel,
-            TFTransfoXLMainLayer,
-            TFTransfoXLModel,
-            TFTransfoXLPreTrainedModel,
-        )
         from .models.vision_encoder_decoder import TFVisionEncoderDecoderModel
         from .models.vision_text_dual_encoder import TFVisionTextDualEncoderModel
-        from .models.vit import TFViTForImageClassification, TFViTModel, TFViTPreTrainedModel
-        from .models.vit_mae import TFViTMAEForPreTraining, TFViTMAEModel, TFViTMAEPreTrainedModel
+        from .models.vit import (
+            TFViTForImageClassification,
+            TFViTModel,
+            TFViTPreTrainedModel,
+        )
+        from .models.vit_mae import (
+            TFViTMAEForPreTraining,
+            TFViTMAEModel,
+            TFViTMAEPreTrainedModel,
+        )
         from .models.wav2vec2 import (
             TF_WAV_2_VEC_2_PRETRAINED_MODEL_ARCHIVE_LIST,
             TFWav2Vec2ForCTC,
@@ -7578,7 +8490,12 @@
         )
 
         # Optimization
-        from .optimization_tf import AdamWeightDecay, GradientAccumulator, WarmUp, create_optimizer
+        from .optimization_tf import (
+            AdamWeightDecay,
+            GradientAccumulator,
+            WarmUp,
+            create_optimizer,
+        )
 
         # Trainer
         from .trainer_tf import TFTrainer
@@ -7595,7 +8512,11 @@
     except OptionalDependencyNotAvailable:
         from .utils.dummy_essentia_and_librosa_and_pretty_midi_and_scipy_and_torch_objects import *
     else:
-        from .models.pop2piano import Pop2PianoFeatureExtractor, Pop2PianoProcessor, Pop2PianoTokenizer
+        from .models.pop2piano import (
+            Pop2PianoFeatureExtractor,
+            Pop2PianoProcessor,
+            Pop2PianoTokenizer,
+        )
 
     try:
         if not is_flax_available():
@@ -7711,7 +8632,11 @@
             FlaxBlenderbotSmallModel,
             FlaxBlenderbotSmallPreTrainedModel,
         )
-        from .models.bloom import FlaxBloomForCausalLM, FlaxBloomModel, FlaxBloomPreTrainedModel
+        from .models.bloom import (
+            FlaxBloomForCausalLM,
+            FlaxBloomModel,
+            FlaxBloomPreTrainedModel,
+        )
         from .models.clip import (
             FlaxCLIPModel,
             FlaxCLIPPreTrainedModel,
@@ -7742,11 +8667,36 @@
             FlaxElectraPreTrainedModel,
         )
         from .models.encoder_decoder import FlaxEncoderDecoderModel
-        from .models.gpt2 import FlaxGPT2LMHeadModel, FlaxGPT2Model, FlaxGPT2PreTrainedModel
-        from .models.gpt_neo import FlaxGPTNeoForCausalLM, FlaxGPTNeoModel, FlaxGPTNeoPreTrainedModel
-        from .models.gptj import FlaxGPTJForCausalLM, FlaxGPTJModel, FlaxGPTJPreTrainedModel
-        from .models.longt5 import FlaxLongT5ForConditionalGeneration, FlaxLongT5Model, FlaxLongT5PreTrainedModel
-        from .models.marian import FlaxMarianModel, FlaxMarianMTModel, FlaxMarianPreTrainedModel
+        from .models.gpt2 import (
+            FlaxGPT2LMHeadModel,
+            FlaxGPT2Model,
+            FlaxGPT2PreTrainedModel,
+        )
+        from .models.gpt_neo import (
+            FlaxGPTNeoForCausalLM,
+            FlaxGPTNeoModel,
+            FlaxGPTNeoPreTrainedModel,
+        )
+        from .models.gptj import (
+            FlaxGPTJForCausalLM,
+            FlaxGPTJModel,
+            FlaxGPTJPreTrainedModel,
+        )
+        from .models.llama import (
+            FlaxLlamaForCausalLM,
+            FlaxLlamaModel,
+            FlaxLlamaPreTrainedModel,
+        )
+        from .models.longt5 import (
+            FlaxLongT5ForConditionalGeneration,
+            FlaxLongT5Model,
+            FlaxLongT5PreTrainedModel,
+        )
+        from .models.marian import (
+            FlaxMarianModel,
+            FlaxMarianMTModel,
+            FlaxMarianPreTrainedModel,
+        )
         from .models.mbart import (
             FlaxMBartForConditionalGeneration,
             FlaxMBartForQuestionAnswering,
@@ -7754,11 +8704,27 @@
             FlaxMBartModel,
             FlaxMBartPreTrainedModel,
         )
-        from .models.mt5 import FlaxMT5EncoderModel, FlaxMT5ForConditionalGeneration, FlaxMT5Model
+        from .models.mt5 import (
+            FlaxMT5EncoderModel,
+            FlaxMT5ForConditionalGeneration,
+            FlaxMT5Model,
+        )
         from .models.opt import FlaxOPTForCausalLM, FlaxOPTModel, FlaxOPTPreTrainedModel
-        from .models.pegasus import FlaxPegasusForConditionalGeneration, FlaxPegasusModel, FlaxPegasusPreTrainedModel
-        from .models.regnet import FlaxRegNetForImageClassification, FlaxRegNetModel, FlaxRegNetPreTrainedModel
-        from .models.resnet import FlaxResNetForImageClassification, FlaxResNetModel, FlaxResNetPreTrainedModel
+        from .models.pegasus import (
+            FlaxPegasusForConditionalGeneration,
+            FlaxPegasusModel,
+            FlaxPegasusPreTrainedModel,
+        )
+        from .models.regnet import (
+            FlaxRegNetForImageClassification,
+            FlaxRegNetModel,
+            FlaxRegNetPreTrainedModel,
+        )
+        from .models.resnet import (
+            FlaxResNetForImageClassification,
+            FlaxResNetModel,
+            FlaxResNetPreTrainedModel,
+        )
         from .models.roberta import (
             FlaxRobertaForCausalLM,
             FlaxRobertaForMaskedLM,
@@ -7789,10 +8755,19 @@
             FlaxRoFormerPreTrainedModel,
         )
         from .models.speech_encoder_decoder import FlaxSpeechEncoderDecoderModel
-        from .models.t5 import FlaxT5EncoderModel, FlaxT5ForConditionalGeneration, FlaxT5Model, FlaxT5PreTrainedModel
+        from .models.t5 import (
+            FlaxT5EncoderModel,
+            FlaxT5ForConditionalGeneration,
+            FlaxT5Model,
+            FlaxT5PreTrainedModel,
+        )
         from .models.vision_encoder_decoder import FlaxVisionEncoderDecoderModel
         from .models.vision_text_dual_encoder import FlaxVisionTextDualEncoderModel
-        from .models.vit import FlaxViTForImageClassification, FlaxViTModel, FlaxViTPreTrainedModel
+        from .models.vit import (
+            FlaxViTForImageClassification,
+            FlaxViTModel,
+            FlaxViTPreTrainedModel,
+        )
         from .models.wav2vec2 import (
             FlaxWav2Vec2ForCTC,
             FlaxWav2Vec2ForPreTraining,
@@ -7805,7 +8780,11 @@
             FlaxWhisperModel,
             FlaxWhisperPreTrainedModel,
         )
-        from .models.xglm import FlaxXGLMForCausalLM, FlaxXGLMModel, FlaxXGLMPreTrainedModel
+        from .models.xglm import (
+            FlaxXGLMForCausalLM,
+            FlaxXGLMModel,
+            FlaxXGLMPreTrainedModel,
+        )
         from .models.xlm_roberta import (
             FLAX_XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
             FlaxXLMRobertaForCausalLM,
diff --git a/src/transformers/activations.py b/src/transformers/activations.py
index be26825f4bad15..2355fb5fed678d 100644
--- a/src/transformers/activations.py
+++ b/src/transformers/activations.py
@@ -206,6 +206,7 @@ def __getitem__(self, key):
     "gelu_pytorch_tanh": PytorchGELUTanh,
     "gelu_accurate": AccurateGELUActivation,
     "laplace": LaplaceActivation,
+    "leaky_relu": nn.LeakyReLU,
     "linear": LinearActivation,
     "mish": MishActivation,
     "quick_gelu": QuickGELUActivation,
diff --git a/src/transformers/cache_utils.py b/src/transformers/cache_utils.py
new file mode 100644
index 00000000000000..b298a7bdd0f5d6
--- /dev/null
+++ b/src/transformers/cache_utils.py
@@ -0,0 +1,322 @@
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+
+
+class Cache:
+    """
+    Base, abstract class for all caches. The actual data structure is specific to each subclass.
+    """
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. These are specific to each subclass and allow new types of
+                cache to be created.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        raise NotImplementedError("Make sure to implement `update` in a subclass.")
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.")
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states, if there is any."""
+        raise NotImplementedError("Make sure to implement `get_max_length` in a subclass.")
+
+    def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int:
+        """Given the sequence length of the new inputs, returns the usable length of the cache."""
+        # Cache without size limit -> all cache is usable
+        # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache
+        #   length, we will need to evict part of the cache (and thus not all cache is usable)
+        max_length = self.get_max_length()
+        previous_seq_length = self.get_seq_length(layer_idx)
+        if max_length is not None and previous_seq_length + new_seq_length > max_length:
+            return max_length - new_seq_length
+        return previous_seq_length
+
+
+class DynamicCache(Cache):
+    """
+    A cache that grows dynamically as more tokens are generated. This is the default for generative models.
+
+    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
+    `[batch_size, num_heads, seq_len, head_dim]`.
+    """
+
+    def __init__(self) -> None:
+        self.key_cache: List[torch.Tensor] = []
+        self.value_cache: List[torch.Tensor] = []
+        self.seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+
+    def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
+        """
+        Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the
+        sequence length.
+        """
+        if layer_idx < len(self):
+            return (self.key_cache[layer_idx], self.value_cache[layer_idx])
+        else:
+            raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
+
+    def __iter__(self):
+        """
+        Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over
+        keys and values
+        """
+        for layer_idx in range(len(self)):
+            yield (self.key_cache[layer_idx], self.value_cache[layer_idx])
+
+    def __len__(self):
+        """
+        Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds
+        to the number of layers in the model.
+        """
+        return len(self.key_cache)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            self.seen_tokens += key_states.shape[-2]
+
+        # Update the cache
+        if len(self.key_cache) <= layer_idx:
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states. DynamicCache does not have a maximum length."""
+        return None
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+    def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]:
+        """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format."""
+        legacy_cache = ()
+        for layer_idx in range(len(self)):
+            legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),)
+        return legacy_cache
+
+    @classmethod
+    def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache":
+        """Converts a cache in the legacy cache format into an equivalent `DynamicCache`."""
+        cache = cls()
+        if past_key_values is not None:
+            for layer_idx in range(len(past_key_values)):
+                key_states, value_states = past_key_values[layer_idx]
+                cache.update(key_states, value_states, layer_idx)
+        return cache
+
+
+class SinkCache(Cache):
+    """
+    A cache that as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model to
+    generate beyond the length of its context window, without losing fluency in the conversation. As it discards past
+    tokens, the model will lose the ability to generate tokens that depend on the context that was discarded.
+
+    It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is
+    `[batch_size, num_heads, seq_len, head_dim]`.
+
+    Parameters:
+        window_length (`int`):
+            The length of the context window.
+        num_sink_tokens (`int`):
+            The number of sink tokens. See the original paper for more information.
+    """
+
+    def __init__(self, window_length: int, num_sink_tokens: int) -> None:
+        self.key_cache: List[torch.Tensor] = []
+        self.value_cache: List[torch.Tensor] = []
+        self.window_length = window_length
+        self.num_sink_tokens = num_sink_tokens
+        self.cos_sin_cache = {}
+        self.seen_tokens = 0  # Used in `generate` to keep tally of how many tokens the cache has seen
+
+    @staticmethod
+    def _rotate_half(x):
+        x1 = x[..., : x.shape[-1] // 2]
+        x2 = x[..., x.shape[-1] // 2 :]
+        return torch.cat((-x2, x1), dim=-1)
+
+    def _apply_key_rotary_pos_emb(
+        self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+    ) -> torch.Tensor:
+        rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin)
+        return rotated_key_states
+
+    def _get_rerotation_cos_sin(
+        self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if key_states.shape[-2] not in self.cos_sin_cache:
+            # Upcast to float32 temporarily for better accuracy
+            cos = cos.to(torch.float32)
+            sin = sin.to(torch.float32)
+
+            # Compute the cos and sin required for back- and forward-rotating to one position earlier in the sequence
+            original_cos = cos[self.num_sink_tokens + key_states.shape[-2] :]
+            shifted_cos = cos[self.num_sink_tokens : -key_states.shape[-2]]
+            original_sin = sin[self.num_sink_tokens + key_states.shape[-2] :]
+            shifted_sin = sin[self.num_sink_tokens : -key_states.shape[-2]]
+            rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin
+            rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin
+
+            self.cos_sin_cache[key_states.shape[-2]] = (
+                rerotation_cos.to(key_states.dtype).unsqueeze(0),
+                rerotation_sin.to(key_states.dtype).unsqueeze(0),
+            )
+        return self.cos_sin_cache[key_states.shape[-2]]
+
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+    def get_max_length(self) -> Optional[int]:
+        """Returns the maximum sequence length of the cached states."""
+        return self.window_length
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
+
+        Parameters:
+            key_states (`torch.Tensor`):
+                The new key states to cache.
+            value_states (`torch.Tensor`):
+                The new value states to cache.
+            layer_idx (`int`):
+                The index of the layer to cache the states for.
+            cache_kwargs (`Dict[str, Any]`, `optional`):
+                Additional arguments for the cache subclass. The following arguments can be used in `SinkCache`: `sin`,
+                `cos` and `partial_rotation_size`. These arguments are used with models using RoPE, to recompute the
+                rotation as the tokens are shifted.
+
+        Return:
+            A tuple containing the updated key and value states.
+        """
+        # Optional kwargs for `SinkCache` -- needed on models using RoPE. `partial_rotation_size` is used on models
+        # with partially rotated position embeddings, like Phi or Persimmon.
+        sin = cache_kwargs.get("sin")
+        cos = cache_kwargs.get("cos")
+        partial_rotation_size = cache_kwargs.get("partial_rotation_size")
+        using_rope = cos is not None and sin is not None
+
+        # Update the number of seen tokens
+        if layer_idx == 0:
+            self.seen_tokens += key_states.shape[-2]
+
+        # [bsz, num_heads, seq_len, head_dim]
+        if len(self.key_cache) <= layer_idx:
+            # Empty cache
+            self.key_cache.append(key_states)
+            self.value_cache.append(value_states)
+
+        elif key_states.shape[-2] + self.get_seq_length(layer_idx) < self.window_length:
+            # Growing cache
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+
+        else:
+            # Shifting cache
+            keys_to_keep = self.key_cache[layer_idx][
+                :, :, -self.window_length + self.num_sink_tokens + key_states.shape[-2] :
+            ]
+
+            # On RoPE models, we need to recompute the Key rotation as the tokens are shifted
+            if using_rope:
+                rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin(
+                    key_states, cos[: self.window_length], sin[: self.window_length]
+                )
+                if partial_rotation_size is not None:
+                    keys_to_keep, keys_pass = (
+                        keys_to_keep[..., :partial_rotation_size],
+                        keys_to_keep[..., partial_rotation_size:],
+                    )
+                keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin)
+                if partial_rotation_size is not None:
+                    keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1)
+
+            # Concatenate sink tokens, shifted & rotated tokens (if needed), and new tokens
+            sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens]
+            self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep, key_states], dim=-2)
+
+            sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens]
+            values_to_keep = self.value_cache[layer_idx][
+                :, :, -self.window_length + self.num_sink_tokens + value_states.shape[-2] :
+            ]
+            self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep, value_states], dim=-2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
diff --git a/src/transformers/commands/convert.py b/src/transformers/commands/convert.py
index b46e14f5a67320..77df8ea1106439 100644
--- a/src/transformers/commands/convert.py
+++ b/src/transformers/commands/convert.py
@@ -123,23 +123,6 @@ def run(self):
             )
 
             convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
-        elif self._model_type == "transfo_xl":
-            try:
-                from ..models.transfo_xl.convert_transfo_xl_original_tf_checkpoint_to_pytorch import (
-                    convert_transfo_xl_checkpoint_to_pytorch,
-                )
-            except ImportError:
-                raise ImportError(IMPORT_ERROR_MESSAGE)
-
-            if "ckpt" in self._tf_checkpoint.lower():
-                TF_CHECKPOINT = self._tf_checkpoint
-                TF_DATASET_FILE = ""
-            else:
-                TF_DATASET_FILE = self._tf_checkpoint
-                TF_CHECKPOINT = ""
-            convert_transfo_xl_checkpoint_to_pytorch(
-                TF_CHECKPOINT, self._config, self._pytorch_dump_output, TF_DATASET_FILE
-            )
         elif self._model_type == "gpt2":
             try:
                 from ..models.gpt2.convert_gpt2_original_tf_checkpoint_to_pytorch import (
@@ -179,6 +162,4 @@ def run(self):
 
             convert_rembert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output)
         else:
-            raise ValueError(
-                "--model_type should be selected in the list [bert, gpt, gpt2, t5, transfo_xl, xlnet, xlm, lxmert]"
-            )
+            raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, t5, xlnet, xlm, lxmert]")
diff --git a/src/transformers/configuration_utils.py b/src/transformers/configuration_utils.py
index 22ea0abbd60104..5c419ee0fc7cf3 100755
--- a/src/transformers/configuration_utils.py
+++ b/src/transformers/configuration_utils.py
@@ -236,6 +236,8 @@ class PretrainedConfig(PushToHubMixin):
 
             This attribute is currently not being used during model loading time, but this may change in the future
             versions. But we can already start preparing for the future by saving the dtype with save_pretrained.
+        attn_implementation (`str`, *optional*):
+            The attention implementation to use in the model. Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (attention using [`torch.nn.functional.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (attention using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation.
 
         > TensorFlow specific parameters
 
@@ -374,6 +376,9 @@ def __init__(self, **kwargs):
         # Config hash
         self._commit_hash = kwargs.pop("_commit_hash", None)
 
+        # Attention implementation to use, if relevant.
+        self._attn_implementation_internal = kwargs.pop("attn_implementation", None)
+
         # Drop the transformers version info
         self.transformers_version = kwargs.pop("transformers_version", None)
 
@@ -422,6 +427,22 @@ def num_labels(self, num_labels: int):
             self.id2label = {i: f"LABEL_{i}" for i in range(num_labels)}
             self.label2id = dict(zip(self.id2label.values(), self.id2label.keys()))
 
+    @property
+    def _attn_implementation(self):
+        # This property is made private for now (as it cannot be changed and a PreTrainedModel.use_attn_implementation method needs to be implemented.)
+        if hasattr(self, "_attn_implementation_internal"):
+            if self._attn_implementation_internal is None:
+                # `config.attn_implementation` should never be None, for backward compatibility.
+                return "eager"
+            else:
+                return self._attn_implementation_internal
+        else:
+            return "eager"
+
+    @_attn_implementation.setter
+    def _attn_implementation(self, value):
+        self._attn_implementation_internal = value
+
     def save_pretrained(self, save_directory: Union[str, os.PathLike], push_to_hub: bool = False, **kwargs):
         """
         Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the
@@ -747,6 +768,9 @@ def from_dict(cls, config_dict: Dict[str, Any], **kwargs) -> "PretrainedConfig":
         if "_commit_hash" in kwargs and "_commit_hash" in config_dict:
             kwargs["_commit_hash"] = config_dict["_commit_hash"]
 
+        # We remove it from kwargs so that it does not appear in `return_unused_kwargs`.
+        config_dict["attn_implementation"] = kwargs.pop("attn_implementation", None)
+
         config = cls(**config_dict)
 
         if hasattr(config, "pruned_heads"):
@@ -861,8 +885,8 @@ def to_diff_dict(self) -> Dict[str, Any]:
 
         self.dict_torch_dtype_to_str(serializable_config_dict)
 
-        if "_flash_attn_2_enabled" in serializable_config_dict:
-            del serializable_config_dict["_flash_attn_2_enabled"]
+        if "_attn_implementation_internal" in serializable_config_dict:
+            del serializable_config_dict["_attn_implementation_internal"]
 
         return serializable_config_dict
 
@@ -880,8 +904,8 @@ def to_dict(self) -> Dict[str, Any]:
             del output["_auto_class"]
         if "_commit_hash" in output:
             del output["_commit_hash"]
-        if "_flash_attn_2_enabled" in output:
-            del output["_flash_attn_2_enabled"]
+        if "_attn_implementation_internal" in output:
+            del output["_attn_implementation_internal"]
 
         # Transformers version when serializing the model
         output["transformers_version"] = __version__
diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py
index 93e21ab2d3e561..fcace1826ac453 100644
--- a/src/transformers/dependency_versions_table.py
+++ b/src/transformers/dependency_versions_table.py
@@ -2,8 +2,8 @@
 # 1. modify the `_deps` dict in setup.py
 # 2. run `make deps_table_update``
 deps = {
-    "Pillow": "Pillow<10.0.0",
-    "accelerate": "accelerate>=0.20.3",
+    "Pillow": "Pillow>=10.0.1,<=15.0",
+    "accelerate": "accelerate>=0.21.0",
     "av": "av==9.2.0",
     "beautifulsoup4": "beautifulsoup4",
     "codecarbon": "codecarbon==1.2.0",
@@ -55,13 +55,13 @@
     "pytest-timeout": "pytest-timeout",
     "pytest-xdist": "pytest-xdist",
     "python": "python>=3.8.0",
-    "ray[tune]": "ray[tune]",
+    "ray[tune]": "ray[tune]>=2.7.0",
     "regex": "regex!=2019.12.17",
     "requests": "requests",
     "rhoknp": "rhoknp>=1.1.0,<1.3.1",
     "rjieba": "rjieba",
     "rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
-    "ruff": "ruff>=0.1.5,<=0.2",
+    "ruff": "ruff==0.1.5",
     "sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
     "sacremoses": "sacremoses",
     "safetensors": "safetensors>=0.3.1",
diff --git a/src/transformers/dynamic_module_utils.py b/src/transformers/dynamic_module_utils.py
index ebf772a959e9d1..7cdc0ad93d5268 100644
--- a/src/transformers/dynamic_module_utils.py
+++ b/src/transformers/dynamic_module_utils.py
@@ -25,6 +25,8 @@
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union
 
+from huggingface_hub import try_to_load_from_cache
+
 from .utils import (
     HF_MODULES_CACHE,
     TRANSFORMERS_DYNAMIC_MODULE_NAME,
@@ -32,7 +34,6 @@
     extract_commit_hash,
     is_offline_mode,
     logging,
-    try_to_load_from_cache,
 )
 
 
diff --git a/src/transformers/generation/beam_search.py b/src/transformers/generation/beam_search.py
index a46859c88cbd99..5e73862e163ddf 100644
--- a/src/transformers/generation/beam_search.py
+++ b/src/transformers/generation/beam_search.py
@@ -151,7 +151,7 @@ class BeamSearchScorer(BeamScorer):
             beam search algorithm).
         num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
             The number of beam hypotheses that shall be returned upon calling
-            [`~transformer.BeamSearchScorer.finalize`].
+            [`~transformers.BeamSearchScorer.finalize`].
         num_beam_groups (`int`, *optional*, defaults to 1):
             Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
             See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
@@ -224,8 +224,8 @@ def process(
         group_index: Optional[int] = 0,
         decoder_prompt_len: Optional[int] = 0,
     ) -> Dict[str, torch.Tensor]:
-        # add up to the length which the next_scores is calculated on
-        cur_len = input_ids.shape[-1] - decoder_prompt_len + 1
+        # add up to the length which the next_scores is calculated on (including decoder prompt)
+        cur_len = input_ids.shape[-1] + 1
         batch_size = len(self._beam_hyps) // self.num_beam_groups
 
         if not (batch_size == (input_ids.shape[0] // self.group_size)):
@@ -279,15 +279,11 @@ def process(
                     else:
                         beam_index = None
 
-                    # skip the corner case where the very first generated token is eos_token
-                    if decoder_prompt_len == input_ids.shape[-1]:
-                        continue
-
                     self._beam_hyps[batch_group_idx].add(
                         input_ids[batch_beam_idx].clone(),
                         next_score.item(),
                         beam_indices=beam_index,
-                        decoder_prompt_len=decoder_prompt_len,
+                        generated_len=cur_len - decoder_prompt_len,
                     )
                 else:
                     # add next predicted token since it is not eos_token
@@ -308,7 +304,7 @@ def process(
 
             # Check if we are done so that we can save a pad step if all(done)
             self._done[batch_group_idx] = self._done[batch_group_idx] or self._beam_hyps[batch_group_idx].is_done(
-                next_scores[batch_idx].max().item(), cur_len
+                next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len
             )
 
         return UserDict(
@@ -348,7 +344,8 @@ def finalize(
                 final_score = final_beam_scores[batch_beam_idx].item()
                 final_tokens = input_ids[batch_beam_idx]
                 beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
-                beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, decoder_prompt_len=decoder_prompt_len)
+                generated_len = final_tokens.shape[-1] - decoder_prompt_len
+                beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
 
         # select the best hypotheses
         sent_lengths = input_ids.new(batch_size * self.num_beam_hyps_to_keep)
@@ -444,7 +441,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
             beam search algorithm).
         num_beam_hyps_to_keep (`int`, *optional*, defaults to 1):
             The number of beam hypotheses that shall be returned upon calling
-            [`~transformer.BeamSearchScorer.finalize`].
+            [`~transformers.BeamSearchScorer.finalize`].
         num_beam_groups (`int`, *optional*, defaults to 1):
             Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
             See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
@@ -560,8 +557,8 @@ def process(
                 indicating to which beam the next tokens shall be added.
         """
 
-        # add up to the length which the next_scores is calculated on
-        cur_len = input_ids.shape[-1] - decoder_prompt_len + 1
+        # add up to the length which the next_scores is calculated on (including decoder prompt)
+        cur_len = input_ids.shape[-1] + 1
         batch_size = len(self._beam_hyps)
         if not (batch_size == (input_ids.shape[0] // self.group_size)):
             if self.num_beam_groups > 1:
@@ -617,16 +614,11 @@ def process(
                         else:
                             beam_index = None
 
-                        # skip the corner case where the only constraint token is
-                        # eos_token and the very first generated token is eos_token
-                        if decoder_prompt_len == input_ids.shape[-1]:
-                            continue
-
                         beam_hyp.add(
                             input_ids[batch_beam_idx].clone(),
                             next_score.item(),
                             beam_indices=beam_index,
-                            decoder_prompt_len=decoder_prompt_len,
+                            generated_len=cur_len - decoder_prompt_len,
                         )
                 else:
                     # add next predicted token since it is not eos_token
@@ -660,7 +652,7 @@ def process(
 
             # Check if we are done so that we can save a pad step if all(done)
             self._done[batch_idx] = self._done[batch_idx] or beam_hyp.is_done(
-                next_scores[batch_idx].max().item(), cur_len
+                next_scores[batch_idx].max().item(), cur_len, decoder_prompt_len
             )
 
         return UserDict(
@@ -846,9 +838,8 @@ def finalize(
                 completes_constraint = self.check_completes_constraints(final_tokens.cpu().tolist())
                 if completes_constraint:
                     beam_index = beam_indices[batch_beam_idx] if beam_indices is not None else None
-                    beam_hyp.add(
-                        final_tokens, final_score, beam_indices=beam_index, decoder_prompt_len=decoder_prompt_len
-                    )
+                    generated_len = final_tokens.shape[-1] - decoder_prompt_len
+                    beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
                     ids_collect.append(beam_id)
 
             # due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
@@ -859,7 +850,8 @@ def finalize(
                         batch_beam_idx = batch_idx * self.num_beams + beam_id
                         final_score = final_beam_scores[batch_beam_idx].item()
                         final_tokens = input_ids[batch_beam_idx]
-                        beam_hyp.add(final_tokens, final_score, decoder_prompt_len=decoder_prompt_len)
+                        generated_len = final_tokens.shape[-1] - decoder_prompt_len
+                        beam_hyp.add(final_tokens, final_score, generated_len=generated_len)
                     if len(ids_collect) >= self.num_beam_hyps_to_keep:
                         break
 
@@ -956,12 +948,17 @@ def add(
         hyp: torch.LongTensor,
         sum_logprobs: float,
         beam_indices: Optional[torch.LongTensor] = None,
-        decoder_prompt_len: Optional[int] = 0,
+        generated_len: Optional[int] = None,
     ):
         """
         Add a new hypothesis to the list.
         """
-        score = sum_logprobs / ((hyp.shape[-1] - decoder_prompt_len) ** self.length_penalty)
+        if generated_len is not None:
+            score = sum_logprobs / (generated_len**self.length_penalty)
+        # This 'else' case exists for retrocompatibility
+        else:
+            score = sum_logprobs / (hyp.shape[-1] ** self.length_penalty)
+
         if len(self) < self.num_beams or score > self.worst_score:
             self.beams.append((score, hyp, beam_indices))
             if len(self) > self.num_beams:
@@ -971,7 +968,7 @@ def add(
             else:
                 self.worst_score = min(score, self.worst_score)
 
-    def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
+    def is_done(self, best_sum_logprobs: float, cur_len: int, decoder_prompt_len: Optional[int] = 0) -> bool:
         """
         If there are enough hypotheses and that none of the hypotheses being generated can become better than the worst
         one in the heap, then we are done with this sentence.
@@ -987,7 +984,7 @@ def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
         #  when `length_penalty` is positive. See the discussion below for more details.
         # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
         elif self.early_stopping is False:
-            highest_attainable_score = best_sum_logprobs / cur_len**self.length_penalty
+            highest_attainable_score = best_sum_logprobs / (cur_len - decoder_prompt_len) ** self.length_penalty
             ret = self.worst_score >= highest_attainable_score
             return ret
         # `"never"`: compute the best possible score, depending on the signal of `length_penalty`
@@ -996,9 +993,13 @@ def is_done(self, best_sum_logprobs: float, cur_len: int) -> bool:
             # abs(`highest_attainable_score`) is obtained -> `highest_attainable_score` is negative, hence we obtain
             # its max this way
             if self.length_penalty > 0.0:
-                highest_attainable_score = best_sum_logprobs / self.max_length**self.length_penalty
+                if self.max_length <= decoder_prompt_len:
+                    raise ValueError("max_length is not larger than decoder prompt length")
+                highest_attainable_score = (
+                    best_sum_logprobs / (self.max_length - decoder_prompt_len) ** self.length_penalty
+                )
             # the opposite logic applies here (max `highest_attainable_score` from `cur_len`)
             else:
-                highest_attainable_score = best_sum_logprobs / cur_len**self.length_penalty
+                highest_attainable_score = best_sum_logprobs / (cur_len - decoder_prompt_len) ** self.length_penalty
             ret = self.worst_score >= highest_attainable_score
             return ret
diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py
new file mode 100644
index 00000000000000..7cceac3364afad
--- /dev/null
+++ b/src/transformers/generation/candidate_generator.py
@@ -0,0 +1,330 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import warnings
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Union
+
+import torch
+
+
+if TYPE_CHECKING:
+    from ..modeling_utils import PreTrainedModel
+    from .logits_process import LogitsProcessorList
+
+
+class CandidateGenerator:
+    """Abstract base class for all candidate generators that can be applied during assisted generation."""
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> torch.LongTensor:
+        """
+        Fetches the candidates to be tried for the current input.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+
+        Return:
+            `torch.LongTensor` of shape `(num_candidates, candidate_length)`: The candidate sequences to be assessed by
+            the model.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can call `get_candidates`."
+        )
+
+    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
+        """
+        Updates the candidate generation strategy based on the outcomes.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, candidate_length, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            num_matches (`int`):
+                The number of matches between the candidate sequences and the model predictions.
+        """
+        raise NotImplementedError(
+            f"{self.__class__} is an abstract class. Only classes inheriting this class can call "
+            "`update_candidate_strategy`."
+        )
+
+
+class AssistedCandidateGenerator(CandidateGenerator):
+    """
+    `CandidateGenerator` class to be used for assisted generation. This class generates candidates through the use of
+    a smaller model. Read the following blog post for more information: https://huggingface.co/blog/assisted-generation
+
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+        assistant_model (`PreTrainedModel`):
+            The model to be used for generating candidates. This model should be smaller than the main model.
+        logits_processor (`LogitsProcessorList`):
+            An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
+            used to modify the prediction scores of the language modeling head applied at each generation step.
+        model_kwargs (`Dict`):
+            The keyword arguments that will be passed to the main model, and are used as base inputs for the assistant
+            model as well.
+        inputs_tensor (`torch.Tensor`, *optional*):
+            The model input tensor. In encoder-decoder models, this is the encoder input.
+        eos_token_id (`Union[int, List[int]]`, *optional*):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+    """
+
+    def __init__(
+        self,
+        input_ids: torch.LongTensor,
+        assistant_model: "PreTrainedModel",
+        logits_processor: "LogitsProcessorList",
+        model_kwargs: Dict,
+        inputs_tensor: Optional[torch.Tensor] = None,
+        eos_token_id: Optional[Union[int, List[int]]] = None,
+    ):
+        self.assistant_model = assistant_model
+
+        # Prepare the number of candidate tokens
+        if hasattr(assistant_model, "num_assistant_tokens"):
+            warnings.warn(
+                "Setting `num_assistant_tokens` via `assistant_model.num_assistant_tokens` is deprecated and will be "
+                "removed in v4.37. Make sure to set `num_assistant_tokens` via the generation_config instead.",
+                FutureWarning,
+            )
+            self.num_assistant_tokens = assistant_model.num_assistant_tokens
+        else:
+            self.num_assistant_tokens = assistant_model.generation_config.num_assistant_tokens
+
+        # Prepare the kwargs for the assistant model
+        assistant_kwargs = {}
+        for key, value in model_kwargs.items():  # deepcopy crashes if we attempt to copy encoder outputs with grads
+            if key not in ("encoder_outputs", "assistant_encoder_outputs"):
+                assistant_kwargs[key] = value.detach() if isinstance(value, torch.Tensor) else copy.deepcopy(value)
+
+        if "assistant_encoder_outputs" in model_kwargs:
+            assistant_kwargs["encoder_outputs"] = model_kwargs["assistant_encoder_outputs"]
+        elif assistant_model.config.is_encoder_decoder:
+            inputs_tensor, model_input_name, assistant_kwargs = assistant_model._prepare_model_inputs(
+                inputs_tensor, assistant_model.generation_config.bos_token_id, assistant_kwargs
+            )
+            assistant_kwargs = assistant_model._prepare_encoder_decoder_kwargs_for_generation(
+                inputs_tensor, assistant_kwargs, model_input_name
+            )
+        elif "encoder_outputs" in model_kwargs:
+            assistant_kwargs["encoder_outputs"] = model_kwargs["encoder_outputs"]
+        self.assistant_kwargs = assistant_kwargs
+
+        # Prepare assistant model's keys of inputs
+        if assistant_model.config.is_encoder_decoder:
+            # both are encoder-decoder
+            self.input_ids_key = "decoder_input_ids"
+            self.attention_key = "decoder_attention_mask"
+        elif "encoder_outputs" in assistant_kwargs:
+            # special case for encoder-decoder with decoder-only assistant (like DistilWhisper)
+            self.input_ids_key = "input_ids"
+            self.attention_key = "attention_mask"
+            self.assistant_kwargs["attention_mask"] = self.assistant_kwargs.get(
+                "decoder_attention_mask",
+                torch.ones((input_ids.shape[0], 1), device=input_ids.device, dtype=torch.long),
+            )
+        else:
+            # both are decoder-only
+            self.input_ids_key = "input_ids"
+            self.attention_key = "attention_mask"
+
+        # Prepare other attributes
+        if isinstance(eos_token_id, int):
+            eos_token_id = [eos_token_id]
+        self.eos_token_id_tensor = (
+            torch.tensor(eos_token_id).to(input_ids.device) if eos_token_id is not None else None
+        )
+        self.logits_processor = logits_processor
+
+    def get_candidates(self, input_ids: torch.LongTensor) -> torch.LongTensor:
+        """
+        Fetches the candidates to be tried for the current input.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+
+        Return:
+            `torch.LongTensor` of shape `(num_candidates, candidate_length)`: The candidate sequences to be tried.
+        """
+        # 1. If it is not the first round of candidate generation, prepare the inputs based on the input_ids length
+        # (which implicitly contains the number of accepted candidates from the previous round)
+        has_past_key_values = self.assistant_kwargs.get("past_key_values", None) is not None
+        if has_past_key_values:
+            new_cur_len = input_ids.shape[-1]
+
+            new_cache_size = new_cur_len - 1
+            self.assistant_kwargs["past_key_values"] = _crop_past_key_values(
+                self.assistant_model, self.assistant_kwargs["past_key_values"], new_cache_size - 1
+            )  # the assistant does not have the token after the last match, hence the -1
+
+            self.assistant_kwargs = _prepare_attention_mask(
+                self.assistant_kwargs, new_cur_len, self.assistant_model.config.is_encoder_decoder
+            )
+            self.assistant_kwargs = _prepare_token_type_ids(self.assistant_kwargs, new_cur_len)
+
+        # 2. Forecast next N tokens using the assistant model. This `for` block can be replaced with a `.generate()`
+        # call if we decide to add `past_key_values` as a possible output of generate, as we need access to the
+        # assistant cache to secure strong speedups.
+        candidate_input_ids = input_ids
+        for _ in range(int(self.num_assistant_tokens)):
+            # 2.1 prepare assistant model inputs
+            assistant_inputs = self.assistant_model.prepare_inputs_for_generation(
+                candidate_input_ids,
+                **self.assistant_kwargs,
+            )
+
+            # 2.2. check if the input ids length is correct
+            has_past_key_values = assistant_inputs.get("past_key_values", None) is not None
+            if has_past_key_values and assistant_inputs[self.input_ids_key].shape[-1] not in (1, 2):
+                raise ValueError("The length of the input ids in assistant inputs should be 1 or 2")
+
+            # 2.3. use the assistant model to obtain the next candidate logits
+            assistant_model_outputs = self.assistant_model(**assistant_inputs)
+
+            # 2.4. greedily select the next candidate token
+            if len(self.logits_processor) > 0:
+                assistant_model_outputs.logits[:, -1, :] = self.logits_processor(
+                    candidate_input_ids, assistant_model_outputs.logits[:, -1, :]
+                )
+            new_token = assistant_model_outputs.logits[:, -1, :].argmax(dim=-1)
+            candidate_input_ids = torch.cat((candidate_input_ids, new_token[:, None]), dim=-1)
+
+            # 2.5. update assistant model inputs
+            if self.assistant_kwargs.get(self.attention_key, None) is not None:
+                mask = self.assistant_kwargs[self.attention_key]
+                self.assistant_kwargs[self.attention_key] = torch.cat(
+                    [mask, mask.new_ones((mask.shape[0], 1))], dim=-1
+                )
+            self.assistant_kwargs["past_key_values"] = assistant_model_outputs.past_key_values
+
+            # 2.6. stop assistant generation on EOS
+            if self.eos_token_id_tensor is not None:
+                last_assistant_token_is_eos = new_token.tile(self.eos_token_id_tensor.shape[0], 1)
+                last_assistant_token_is_eos = (
+                    ~last_assistant_token_is_eos.ne(self.eos_token_id_tensor.unsqueeze(1)).prod(dim=0).bool()
+                )
+                if last_assistant_token_is_eos:
+                    break
+
+        return candidate_input_ids
+
+    def update_candidate_strategy(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, num_matches: int):
+        """
+        Updates the candidate generation strategy based on the outcomes.
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. [What are input IDs?](../glossary#input-ids)
+            scores (`torch.FloatTensor` of shape `(batch_size, candidate_length, config.vocab_size)`):
+                Prediction scores of a language modeling head. These can be logits for each vocabulary when not using
+                beam search or log softmax for each vocabulary token when using beam search
+            num_matches (`int`):
+                The number of matches between the candidate sequences and the model predictions.
+        """
+        # Adjust the max number of assistant tokens to use in the next iteration. This is a simple heuristic,
+        # probably can be improved -- we want to balance the benefits of getting assistant tokens correct with the
+        # cost of forecasting incorrect assistant tokens.
+        if self.assistant_model.generation_config.num_assistant_tokens_schedule == "heuristic":
+            if num_matches == int(self.num_assistant_tokens):
+                self.num_assistant_tokens += 2.0
+            else:
+                self.num_assistant_tokens = max(1.0, self.num_assistant_tokens - 1.0)
+
+
+def _crop_past_key_values(model, past_key_values, maximum_length):
+    """Crops the past key values up to a certain maximum length."""
+    new_past = []
+    if model.config.is_encoder_decoder:
+        for idx in range(len(past_key_values)):
+            new_past.append(
+                (
+                    past_key_values[idx][0][:, :, :maximum_length, :],
+                    past_key_values[idx][1][:, :, :maximum_length, :],
+                    past_key_values[idx][2],
+                    past_key_values[idx][3],
+                )
+            )
+        past_key_values = tuple(new_past)
+    # bloom is special
+    elif "bloom" in model.__class__.__name__.lower() or (
+        model.config.architectures is not None and "bloom" in model.config.architectures[0].lower()
+    ):
+        for idx in range(len(past_key_values)):
+            new_past.append(
+                (
+                    past_key_values[idx][0][:, :, :maximum_length],
+                    past_key_values[idx][1][:, :maximum_length, :],
+                )
+            )
+        past_key_values = tuple(new_past)
+    # gptbigcode is too
+    elif "gptbigcode" in model.__class__.__name__.lower() or (
+        model.config.architectures is not None and "gptbigcode" in model.config.architectures[0].lower()
+    ):
+        if model.config.multi_query:
+            for idx in range(len(past_key_values)):
+                past_key_values[idx] = past_key_values[idx][:, :maximum_length, :]
+        else:
+            for idx in range(len(past_key_values)):
+                past_key_values[idx] = past_key_values[idx][:, :, :maximum_length, :]
+    else:
+        for idx in range(len(past_key_values)):
+            new_past.append(
+                (
+                    past_key_values[idx][0][:, :, :maximum_length, :],
+                    past_key_values[idx][1][:, :, :maximum_length, :],
+                )
+            )
+        past_key_values = tuple(new_past)
+    return past_key_values
+
+
+def _prepare_attention_mask(model_kwargs: Dict[str, Any], new_length: int, is_encoder_decoder: bool) -> Dict[str, Any]:
+    """Expands or crops the model's mask for decoding purposes, to the defined length"""
+
+    mask_key = "decoder_attention_mask" if is_encoder_decoder else "attention_mask"
+    if mask_key not in model_kwargs:
+        return model_kwargs
+
+    mask = model_kwargs[mask_key]
+    mask_length_diff = new_length - mask.shape[1]
+
+    if mask_length_diff < 0:
+        model_kwargs[mask_key] = mask[:, :mask_length_diff]
+    elif mask_length_diff > 0:
+        model_kwargs[mask_key] = torch.cat([mask, mask.new_ones((mask.shape[0], mask_length_diff))], dim=-1)
+    return model_kwargs
+
+
+def _prepare_token_type_ids(model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
+    """Expands or crops the model's token_type_ids for decoding purposes, to the defined length"""
+    if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:
+        return model_kwargs
+
+    token_type_ids = model_kwargs["token_type_ids"]
+    final_token_type = token_type_ids[:, -1].unsqueeze(-1)
+    type_length_diff = new_length - token_type_ids.shape[1]
+
+    if type_length_diff < 0:
+        token_type_ids = token_type_ids[:, :type_length_diff]
+    elif type_length_diff > 0:
+        token_type_copies = final_token_type.repeat(1, type_length_diff)
+        model_kwargs["token_type_ids"] = torch.cat([model_kwargs["token_type_ids"], token_type_copies], dim=-1)
+    return model_kwargs
diff --git a/src/transformers/generation/configuration_utils.py b/src/transformers/generation/configuration_utils.py
index 6b0b434ec726b0..4818ca8d97b7f1 100644
--- a/src/transformers/generation/configuration_utils.py
+++ b/src/transformers/generation/configuration_utils.py
@@ -63,6 +63,14 @@ class GenerationConfig(PushToHubMixin):
     You do not need to call any of the above methods directly. Pass custom parameter values to '.generate()'. To learn
     more about decoding strategies refer to the [text generation strategies guide](../generation_strategies).
 
+    <Tip>
+
+    A large number of these flags control the logits or the stopping criteria of the generation. Make sure you check
+    the [generate-related classes](https://huggingface.co/docs/transformers/internal/generation_utils) for a full
+    description of the possible manipulations, as well as examples of their usage.
+
+    </Tip>
+
     Arg:
         > Parameters that control the length of the output
 
@@ -410,7 +418,7 @@ def validate(self, is_init=False):
 
         # 2. detect beam-only parameterization when not in beam mode
         if self.num_beams is None:
-            logging.warning("`num_beams` is set to None - defaulting to 1.", UserWarning)
+            warnings.warn("`num_beams` is set to None - defaulting to 1.", UserWarning)
             self.num_beams = 1
 
         if self.num_beams == 1:
@@ -497,6 +505,24 @@ def validate(self, is_init=False):
                     f"({self.num_beams})."
                 )
 
+        # 5. check common issue: passing `generate` arguments inside the generation config
+        generate_arguments = (
+            "logits_processor",
+            "stopping_criteria",
+            "prefix_allowed_tokens_fn",
+            "synced_gpus",
+            "assistant_model",
+            "streamer",
+            "negative_prompt_ids",
+            "negative_prompt_attention_mask",
+        )
+        for arg in generate_arguments:
+            if hasattr(self, arg):
+                raise ValueError(
+                    f"Argument `{arg}` is not a valid argument of `GenerationConfig`. It should be passed to "
+                    "`generate()` (or a pipeline) directly."
+                )
+
     def save_pretrained(
         self,
         save_directory: Union[str, os.PathLike],
diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py
index c83afdc0a580eb..4fce8970f8647c 100644
--- a/src/transformers/generation/flax_utils.py
+++ b/src/transformers/generation/flax_utils.py
@@ -809,6 +809,9 @@ def gather_fn(tensor):
         pad_token_id = jnp.array(pad_token_id, dtype=jnp.int32)
         cur_len = jnp.array(cur_len)
 
+        # record the prompt length of decoder
+        decoder_prompt_len = input_ids.shape[-1]
+
         # per batch,beam-item holding current token in loop.
         sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
         running_sequences = jnp.full((batch_size, num_beams, max_length), pad_token_id, dtype=jnp.int32)
@@ -861,9 +864,13 @@ def beam_search_cond_fn(state):
             # early_stopping == "never" -> compute the best score from max_length or cur_len, depending on the sign of
             #   length_penalty. Positive length_penalty favors longer sequences, thus we use max_length there.
             if early_stopping == "never" and length_penalty > 0.0:
-                best_running_score = state.running_scores[:, :1] / (max_length**length_penalty)
+                best_running_score = state.running_scores[:, :1] / (
+                    (max_length - decoder_prompt_len) ** length_penalty
+                )
             else:
-                best_running_score = state.running_scores[:, :1] / (state.cur_len**length_penalty)
+                best_running_score = state.running_scores[:, :1] / (
+                    (state.cur_len - decoder_prompt_len) ** length_penalty
+                )
             worst_finished_score = jnp.where(
                 state.is_sent_finished, jnp.min(state.scores, axis=1, keepdims=True), np.array(-1.0e7)
             )
@@ -953,7 +960,7 @@ def beam_search_body_fn(state, input_ids_length=1):
             # - add length penalty
             # - make sure no scores can be added anymore if beam is full
             # - make sure still running sequences cannot be chosen as finalized beam
-            topk_log_probs = topk_log_probs / (state.cur_len**length_penalty)
+            topk_log_probs = topk_log_probs / ((state.cur_len + 1 - decoder_prompt_len) ** length_penalty)
             beams_in_batch_are_full = jnp.broadcast_to(
                 state.is_sent_finished.all(axis=-1, keepdims=True), did_topk_just_finished.shape
             ) & (early_stopping is True)
@@ -990,9 +997,10 @@ def beam_search_body_fn(state, input_ids_length=1):
                 model_kwargs=next_model_kwargs,
             )
 
-        # The very first prompt often has sequence length > 1, so run outside of `lax.while_loop` to comply with TPU
-        if input_ids.shape[-1] > 1:
-            state = partial(beam_search_body_fn, input_ids_length=input_ids.shape[-1])(state)
+        # Always run first iteration outside of `lax.while_loop` to avoid calling `beam_search_cond_fn`
+        # when `state.cur_len` equals `decoder_prompt_len`. This also helps to comply with TPU when
+        # the very first prompt has sequence length > 1.
+        state = partial(beam_search_body_fn, input_ids_length=input_ids.shape[-1])(state)
 
         if not trace:
             state = self._run_loop_in_debug(beam_search_cond_fn, beam_search_body_fn, state)
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py
index 3d1801b248041a..4b9b91cd8068d9 100644
--- a/src/transformers/generation/logits_process.py
+++ b/src/transformers/generation/logits_process.py
@@ -100,13 +100,39 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwa
 
 class MinLengthLogitsProcessor(LogitsProcessor):
     r"""
-    [`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0.
+    [`LogitsProcessor`] enforcing a min-length by setting EOS probability to 0. Note that, for decoder-only models
+    like most LLMs, the length includes the prompt.
 
     Args:
         min_length (`int`):
             The minimum length below which the score of `eos_token_id` is set to `-float("Inf")`.
         eos_token_id (`Union[int, List[int]]`):
             The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("A number:", return_tensors="pt")
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one
+
+    >>> # setting `min_length` to a value smaller than the uncontrolled output length has no impact
+    >>> gen_out = model.generate(**inputs, min_length=3)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one
+
+    >>> # setting a larger `min_length` will force the model to generate beyond its natural ending point, which is not
+    >>> # necessarily incorrect
+    >>> gen_out = model.generate(**inputs, min_length=10)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one thousand, nine hundred and ninety-four
+    ```
     """
 
     def __init__(self, min_length: int, eos_token_id: Union[int, List[int]]):
@@ -133,9 +159,7 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
     r"""
     [`LogitsProcessor`] enforcing a min-length of new tokens by setting EOS (End-Of-Sequence) token probability to 0.
-    Note that for decoder-only models, such as Llama2, `min_length` will compute the length of `prompt + newly
-    generated tokens` whereas for other models it will behave as `min_new_tokens`, that is, taking only into account
-    the newly generated ones.
+    Contrarily to [`MinLengthLogitsProcessor`], this processor ignores the prompt.
 
     Args:
         prompt_length_to_skip (`int`):
@@ -149,29 +173,21 @@ class MinNewTokensLengthLogitsProcessor(LogitsProcessor):
     Examples:
 
     ```python
-    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
-    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
-    >>> model.config.pad_token_id = model.config.eos_token_id
-    >>> inputs = tokenizer(["Hugging Face Company is"], return_tensors="pt")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer(["A number:"], return_tensors="pt")
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one
 
-    >>> # If the maximum length (default = 20) is smaller than the minimum length constraint, the latter is ignored!
-    >>> outputs = model.generate(**inputs, min_new_tokens=30)
-    >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-    Hugging Face Company is a company that has been working on a new product for the past year.
-
-    >>> # For testing purposes, let's set `eos_token` to `"company"`, the first generated token. This will make
-    >>> # generation end there.
-    >>> outputs = model.generate(**inputs, eos_token_id=1664)
-    >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-    Hugging Face Company is a company
-
-    >>> # Increasing `min_new_tokens` will make generation ignore occurences `"company"` (eos token) before the
-    >>> # minimum length condition is honored.
-    >>> outputs = model.generate(**inputs, min_new_tokens=2, eos_token_id=1664)
-    >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-    Hugging Face Company is a new company
+    >>> # setting `min_new_tokens` will force the model to generate beyond its natural ending point, which is not
+    >>> # necessarily incorrect
+    >>> gen_out = model.generate(**inputs, min_new_tokens=2)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    A number: one thousand
     ```
     """
 
@@ -205,7 +221,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 class TemperatureLogitsWarper(LogitsWarper):
     r"""
     [`LogitsWarper`] for temperature (exponential scaling output probability distribution), which effectively means
-    that it can control the randomness of the predicted tokens.
+    that it can control the randomness of the predicted tokens. Often used together with [`TopPLogitsWarper`] and
+    [`TopKLogitsWarper`].
 
     <Tip>
 
@@ -269,22 +286,18 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
     r"""
-    [`LogitsProcessor`] that prevents the repetition of previous tokens through an exponential penalty. This technique
-    shares some similarities with coverage mechanisms and other aimed at reducing repetition. During the text
-    generation process, the probability distribution for the next token is determined using a formula that incorporates
-    token scores based on their occurrence in the generated sequence. Tokens with higher scores are more likely to be
-    selected. The formula can be seen in the original [paper](https://arxiv.org/pdf/1909.05858.pdf). According to the
-    paper a penalty of around 1.2 yields a good balance between truthful generation and lack of repetition.
-
-    This technique can also be used to reward and thus encourage repetition in a similar manner. To penalize and reduce
+    [`LogitsProcessor`] that prevents the repetition of previous tokens through a penalty. This penalty is applied at
+    most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt.
+
+    In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a penalty of around
+    1.2 to achieve a good balance between truthful generation and lack of repetition. To penalize and reduce
     repetition, use `penalty` values above 1.0, where a higher value penalizes more strongly. To reward and encourage
     repetition, use `penalty` values between 0.0 and 1.0, where a lower value rewards more strongly.
 
     Args:
         penalty (`float`):
             The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated
-            tokens. Between 0.0 and 1.0 rewards previously generated tokens. See [this
-            paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
+            tokens. Between 0.0 and 1.0 rewards previously generated tokens.
 
     Examples:
 
@@ -327,20 +340,39 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class EncoderRepetitionPenaltyLogitsProcessor(LogitsProcessor):
     r"""
-    [`LogitsProcessor`] that avoids hallucination by boosting the probabilities of tokens found within the original
-    input.
+    [`LogitsProcessor`] that works similarly to [`RepetitionPenaltyLogitsProcessor`], but with an *inverse* penalty
+    that is applied to the tokens present in the prompt. In other words, a penalty above 1.0 increases the odds of
+    selecting tokens that were present in the prompt.
 
-    This technique can also be used to reward and thus encourage hallucination (or creativity) in a similar manner. To
-    penalize and reduce hallucination, use `penalty` values above 1.0, where a higher value penalizes more strongly. To
-    reward and encourage hallucination, use `penalty` values between 0.0 and 1.0, where a lower value rewards more
-    strongly.
+    It was designed to avoid hallucination in input-grounded tasks, like summarization. Although originally intended
+    for encoder-decoder models, it can also be used with decoder-only models like LLMs.
 
     Args:
         penalty (`float`):
-            The parameter for hallucination penalty. 1.0 means no penalty. Above 1.0 penalizes hallucination. Between
-            0.0 and 1.0 rewards hallucination.
+            The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 rewards prompt tokens. Between 0.0
+            and 1.0 penalizes prompt tokens.
         encoder_input_ids (`torch.LongTensor`):
             The encoder_input_ids that should be repeated within the decoder ids.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer(["Alice and Bob. The third member's name was"], return_tensors="pt")
+    >>> gen_out = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    Alice and Bob. The third member's name was not mentioned.
+
+    >>> # With the `encoder_repetition_penalty` argument we can trigger this logits processor in `generate`, which can
+    >>> # promote the use of prompt tokens ("Bob" in this example)
+    >>> gen_out = model.generate(**inputs, encoder_repetition_penalty=1.2)
+    >>> print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
+    Alice and Bob. The third member's name was Bob. The third member's name was Bob.
+    ```
     """
 
     def __init__(self, penalty: float, encoder_input_ids: torch.LongTensor):
@@ -363,7 +395,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class TopPLogitsWarper(LogitsWarper):
     """
-    [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off.
+    [`LogitsWarper`] that performs top-p, i.e. restricting to top tokens summing to prob_cut_off <= prob_cut_off. Often
+    used together with [`TemperatureLogitsWarper`] and [`TopKLogitsWarper`].
 
     Args:
         top_p (`float`):
@@ -375,6 +408,7 @@ class TopPLogitsWarper(LogitsWarper):
             Minimum number of tokens that cannot be filtered.
 
     Examples:
+
     ```python
     >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
 
@@ -426,7 +460,8 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class TopKLogitsWarper(LogitsWarper):
     r"""
-    [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements.
+    [`LogitsWarper`] that performs top-k, i.e. restricting to the k highest probability elements. Often used together
+    with [`TemperatureLogitsWarper`] and [`TopPLogitsWarper`].
 
     Args:
         top_k (`int`):
@@ -435,6 +470,29 @@ class TopKLogitsWarper(LogitsWarper):
             All filtered values will be set to this float value.
         min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimum number of tokens that cannot be filtered.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> set_seed(0)
+    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: A, B, C, D", return_tensors="pt")
+
+    >>> # With sampling, the output is unexpected -- sometimes too unexpected.
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: A, B, C, D, G, H, I. A, M
+
+    >>> # With `top_k` sampling, the output gets restricted the k most likely tokens.
+    >>> # Pro tip: In practice, LLMs use `top_k` in the 5-50 range.
+    >>> outputs = model.generate(**inputs, do_sample=True, top_k=2)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    A sequence: A, B, C, D, E, F, G, H, I
+    ```
     """
 
     def __init__(self, top_k: int, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
@@ -455,8 +513,11 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class TypicalLogitsWarper(LogitsWarper):
     r"""
-    [`LogitsWarper`] that performs typical decoding. See [Typical Decoding for Natural Language
-    Generation](https://arxiv.org/abs/2202.00666) for more information.
+    [`LogitsWarper`] that performs typical decoding. Inspired on how humans use language, it prioritizes tokens whose
+    log probability is close to the entropy of the token probability distribution. This means that the most likely
+    tokens may be discarded in the process.
+
+    See [Typical Decoding for Natural Language Generation](https://arxiv.org/abs/2202.00666) for more information.
 
     Args:
         mass (`float`, *optional*, defaults to 0.9):
@@ -465,6 +526,42 @@ class TypicalLogitsWarper(LogitsWarper):
             All filtered values will be set to this float value.
         min_tokens_to_keep (`int`, *optional*, defaults to 1):
             Minimum number of tokens that cannot be filtered.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
+
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("1, 2, 3", return_tensors="pt")
+
+    >>> # We can see that greedy decoding produces a sequence of numbers
+    >>> outputs = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+
+    >>> # For this particular seed, we can see that sampling produces nearly the same low-information (= low entropy)
+    >>> # sequence
+    >>> set_seed(18)
+    >>> outputs = model.generate(**inputs, do_sample=True)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    1, 2, 3, 4, 5, 6, 7, 8, 9 and 10
+
+    >>> # With `typical_p` set, the most obvious sequence is no longer produced, which may be good for your problem
+    >>> set_seed(18)
+    >>> outputs = model.generate(
+    ...     **inputs, do_sample=True, typical_p=0.1, return_dict_in_generate=True, output_scores=True
+    ... )
+    >>> print(tokenizer.batch_decode(outputs.sequences, skip_special_tokens=True)[0])
+    1, 2, 3 and 5
+
+    >>> # We can see that the token corresponding to "4" (token 934) in the second position, the most likely token
+    >>> # as seen with greedy decoding, was entirely blocked out
+    >>> print(outputs.scores[1][0, 934])
+    tensor(-inf)
+    ```
     """
 
     def __init__(self, mass: float = 0.9, filter_value: float = -float("Inf"), min_tokens_to_keep: int = 1):
@@ -721,7 +818,8 @@ class NoRepeatNGramLogitsProcessor(LogitsProcessor):
     sentence: "She runs fast", the bi-grams (n=2) would be ("she", "runs") and ("runs", "fast"). In text generation,
     avoiding repetitions of word sequences provides a more diverse output. This [`LogitsProcessor`] enforces no
     repetition of n-grams by setting the scores of banned tokens to negative infinity which eliminates those tokens
-    from consideration when further processing the scores.
+    from consideration when further processing the scores. Note that, for decoder-only models like most LLMs, the
+    prompt is also considered to obtain the n-grams.
     [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
 
     <Tip>
@@ -774,14 +872,40 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class EncoderNoRepeatNGramLogitsProcessor(LogitsProcessor):
     r"""
-    [`LogitsProcessor`] that enforces no repetition of encoder input ids n-grams for the decoder ids. See
-    [ParlAI](https://github.com/facebookresearch/ParlAI/blob/master/parlai/core/torch_generator_agent.py#L1350).
+    [`LogitsProcessor`] that works similarly to [`NoRepeatNGramLogitsProcessor`], but applied exclusively to prevent
+    the repetition of n-grams present in the prompt.
+
+    It was designed to promote chattiness in a language model, by preventing the generation of n-grams present in
+    previous conversation rounds.
 
     Args:
         encoder_ngram_size (`int`):
             All ngrams of size `ngram_size` can only occur within the encoder input ids.
         encoder_input_ids (`int`):
             The encoder_input_ids that should not be repeated within the decoder ids.
+
+    Examples:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("Alice: I love cats. What do you love?\nBob:", return_tensors="pt")
+
+    >>> # With greedy decoding, we see Bob repeating Alice's opinion. If Bob was a chatbot, it would be a poor one.
+    >>> outputs = model.generate(**inputs)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice: I love cats. What do you love?
+    Bob: I love cats. What do you
+
+    >>> # With this logits processor, we can prevent Bob from repeating Alice's opinion.
+    >>> outputs = model.generate(**inputs, encoder_no_repeat_ngram_size=2)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice: I love cats. What do you love?
+    Bob: My cats are very cute.
+    ```
     """
 
     def __init__(self, encoder_ngram_size: int, encoder_input_ids: torch.LongTensor):
@@ -1060,6 +1184,40 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
             arguments `inputs_ids` and the batch ID `batch_id`. It has to return a list with the allowed tokens for the
             next generation step conditioned on the previously generated tokens `inputs_ids` and the batch ID
             `batch_id`.
+
+    Examples:
+
+    ```py
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("bigscience/bloomz-560m")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bigscience/bloomz-560m")
+
+    >>> inputs = tokenizer("Alice and Bob", return_tensors="pt")
+
+    >>> # By default, it continues generating according to the model's logits
+    >>> outputs = model.generate(**inputs, max_new_tokens=5)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice and Bob are friends
+
+    >>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
+    >>> # For instance, we can force an entire entity to be generated when its beginning is detected.
+    >>> entity =  tokenizer(" Bob Marley", return_tensors="pt").input_ids[0]  # 3 tokens
+    >>> def prefix_allowed_tokens_fn(batch_id, input_ids):
+    ...     '''
+    ...     Attempts to generate 'Bob Marley' when 'Bob' is detected.
+    ...     In this case, `batch_id` is not used, but you can set rules for each batch member.
+    ...     '''
+    ...     if input_ids[-1] == entity[0]:
+    ...         return entity[1]
+    ...     elif input_ids[-2] == entity[0] and input_ids[-1] == entity[1]:
+    ...         return entity[2]
+    ...     return list(range(tokenizer.vocab_size))  # If no match, allow all tokens
+
+    >>> outputs = model.generate(**inputs, max_new_tokens=5, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn)
+    >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+    Alice and Bob Marley
+    ```
     """
 
     def __init__(self, prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]], num_beams: int):
@@ -1071,7 +1229,14 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
         mask = torch.full_like(scores, -math.inf)
         for batch_id, beam_sent in enumerate(input_ids.view(-1, self._num_beams, input_ids.shape[-1])):
             for beam_id, sent in enumerate(beam_sent):
-                mask[batch_id * self._num_beams + beam_id, self._prefix_allowed_tokens_fn(batch_id, sent)] = 0
+                prefix_allowed_tokens = self._prefix_allowed_tokens_fn(batch_id, sent)
+                if len(prefix_allowed_tokens) == 0:
+                    raise ValueError(
+                        f"`prefix_allowed_tokens_fn` returned an empty list for batch ID {batch_id}."
+                        f"This means that the constraint is unsatisfiable. Please check your implementation"
+                        f"of `prefix_allowed_tokens_fn` "
+                    )
+                mask[batch_id * self._num_beams + beam_id, prefix_allowed_tokens] = 0
 
         return scores + mask
 
@@ -1084,56 +1249,20 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
     Search: Decoding Diverse Solutions from Neural Sequence Models](https://arxiv.org/pdf/1610.02424.pdf) for more
     details.
 
-    <Tip>
-
-    Diverse beam search can be particularly useful in scenarios where a variety of different outputs is desired, rather
-    than multiple similar sequences. It allows the model to explore different generation paths and provides a broader
-    coverage of possible outputs.
-
-    </Tip>
-
-    <Tip warning={true}>
-
-    This logits processor can be resource-intensive, especially when using large models or long sequences.
-
-    </Tip>
-
     Traditional beam search often generates very similar sequences across different beams.
     `HammingDiversityLogitsProcessor` addresses this by penalizing beams that generate tokens already chosen by other
     beams in the same time step.
 
-    How It Works:
-    - **Grouping Beams**: Beams are divided into groups. Each group selects tokens independently of the others.
-    - **Penalizing Repeated Tokens**: If a beam in a group selects a token already chosen by another group in the
-        same step, a penalty is applied to that token's score.
-    - **Promoting Diversity**: This penalty discourages beams within a group from selecting the same tokens as
-        beams in other groups.
-
-    Benefits:
-    - **Diverse Outputs**: Produces a variety of different sequences.
-    - **Exploration**: Allows the model to explore different paths.
-
     Args:
         diversity_penalty (`float`):
             This value is subtracted from a beam's score if it generates a token same as any beam from other group at a
-            particular time. Note that `diversity_penalty` is only effective if group beam search is enabled. The
-            penalty applied to a beam's score when it generates a token that has already been chosen by another beam
-            within the same group during the same time step. A higher `diversity_penalty` will enforce greater
-            diversity among the beams, making it less likely for multiple beams to choose the same token. Conversely, a
-            lower penalty will allow beams to more freely choose similar tokens. Adjusting this value can help strike a
-            balance between diversity and natural likelihood.
+            particular time. A higher `diversity_penalty` will enforce greater diversity among the beams. Adjusting
+            this value can help strike a balance between diversity and natural likelihood.
         num_beams (`int`):
-            Number of beams used for group beam search. Beam search is a method used that maintains beams (or "multiple
-            hypotheses") at each step, expanding each one and keeping the top-scoring sequences. A higher `num_beams`
-            will explore more potential sequences. This can increase chances of finding a high-quality output but also
-            increases computational cost.
+            Number of beams for beam search. 1 means no beam search.
         num_beam_groups (`int`):
             Number of groups to divide `num_beams` into in order to ensure diversity among different groups of beams.
-            Each group of beams will operate independently, selecting tokens without considering the choices of other
-            groups. This division promotes diversity by ensuring that beams within different groups explore different
-            paths. For instance, if `num_beams` is 6 and `num_beam_groups` is 2, there will be 2 groups each containing
-            3 beams. The choice of `num_beam_groups` should be made considering the desired level of output diversity
-            and the total number of beams. See [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
+            [this paper](https://arxiv.org/pdf/1610.02424.pdf) for more details.
 
     Examples:
 
@@ -1146,7 +1275,13 @@ class HammingDiversityLogitsProcessor(LogitsProcessor):
     >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
 
     >>> # A long text about the solar system
-    >>> text = "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant interstellar molecular cloud."
+    >>> text = (
+    ...     "The Solar System is a gravitationally bound system comprising the Sun and the objects that orbit it, "
+    ...     "either directly or indirectly. Of the objects that orbit the Sun directly, the largest are the eight "
+    ...     "planets, with the remainder being smaller objects, such as the five dwarf planets and small Solar System "
+    ...     "bodies. The Solar System formed 4.6 billion years ago from the gravitational collapse of a giant "
+    ...     "interstellar molecular cloud."
+    ... )
     >>> inputs = tokenizer("summarize: " + text, return_tensors="pt")
 
     >>> # Generate diverse summary
@@ -1241,11 +1376,34 @@ def __call__(
 
 class ForcedBOSTokenLogitsProcessor(LogitsProcessor):
     r"""
-    [`LogitsProcessor`] that enforces the specified token as the first generated token.
+    [`LogitsProcessor`] that enforces the specified token as the first generated token. Used with encoder-decoder
+    models.
 
     Args:
         bos_token_id (`int`):
             The id of the token to force as the first generated token.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
+
+    >>> inputs = tokenizer("Translate from English to German: I love cats.", return_tensors="pt")
+
+    >>> # By default, it continues generating according to the model's logits
+    >>> outputs = model.generate(**inputs, max_new_tokens=10)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    <pad> Ich liebe Kitty.</s>
+
+    >>> # We can use `forced_bos_token_id` to force the start of generation with an encoder-decoder model
+    >>> # (including forcing it to end straight away with an EOS token)
+    >>> outputs = model.generate(**inputs, max_new_tokens=10, forced_bos_token_id=tokenizer.eos_token_id)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    <pad></s>
+    ```
     """
 
     def __init__(self, bos_token_id: int):
@@ -1271,6 +1429,27 @@ class ForcedEOSTokenLogitsProcessor(LogitsProcessor):
         eos_token_id (`Union[int, List[int]]`):
             The id of the token to force as the last generated token when `max_length` is reached. Optionally, use a
             list to set multiple *end-of-sequence* tokens.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
+
+    >>> # By default, it continues generating according to the model's logits
+    >>> outputs = model.generate(**inputs, max_new_tokens=10)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7, 8
+
+    >>> # `forced_eos_token_id` ensures the generation ends with a EOS token
+    >>> outputs = model.generate(**inputs, max_new_tokens=10, forced_eos_token_id=tokenizer.eos_token_id)
+    >>> print(tokenizer.batch_decode(outputs)[0])
+    A sequence: 1, 2, 3, 4, 5, 6, 7,<|endoftext|>
+    ```
     """
 
     def __init__(self, max_length: int, eos_token_id: Union[int, List[int]]):
@@ -1294,6 +1473,9 @@ class InfNanRemoveLogitsProcessor(LogitsProcessor):
     r"""
     [`LogitsProcessor`] that removes all `nan` and `inf` values to avoid the generation method to fail. Note that using
     the logits processor should only be used if necessary since it can slow down the generation method.
+
+    This logits processor has no `generate` example, as there shouldn't be a correct combination of flags that warrants
+    its use.
     """
 
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -1301,8 +1483,9 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
         # set all nan values to 0.0
         scores[scores != scores] = 0.0
 
-        # set all inf values to max possible value
+        # set all +/-inf values to max/min possible value
         scores[scores == float("inf")] = torch.finfo(scores.dtype).max
+        scores[scores == float("-inf")] = torch.finfo(scores.dtype).min
 
         return scores
 
@@ -1404,6 +1587,29 @@ class LogitNormalization(LogitsProcessor, LogitsWarper):
     the scores during beam search, after applying the logits processors or warpers, since the search algorithm used in
     this library doesn't do it (it only does it before, but they may need re-normalization) but it still supposes that
     the scores are normalized when comparing the hypotheses.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoTokenizer, AutoModelForCausalLM
+    >>> import torch
+
+    >>> model = AutoModelForCausalLM.from_pretrained("distilgpt2")
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
+
+    >>> inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
+
+    >>> # By default, the scores are not normalized -- the sum of their exponentials is NOT a normalized probability
+    >>> # distribution, summing to 1
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(torch.sum(torch.exp(outputs.scores[-1])))
+    tensor(816.3250)
+
+    >>> # Normalizing them may have a positive impact on beam methods, or when using the scores on your application
+    >>> outputs = model.generate(**inputs, renormalize_logits=True, return_dict_in_generate=True, output_scores=True)
+    >>> print(torch.sum(torch.exp(outputs.scores[-1])))
+    tensor(1.0000)
+    ```
     """
 
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
@@ -1415,8 +1621,36 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
     r"""
     [`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
-    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` at not
-    sampled at the begining of the generation.
+    generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
+    not generated at the begining. Originally created for
+    [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
+
+    >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means
+    >>> # it can't generate and EOS token in the first iteration, but it can in the others.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(outputs.scores[1][0, 50256])  # 1 (and not 0) is the first freely generated token
+    tensor(-inf)
+    >>> print(outputs.scores[-1][0, 50256])  # in other places we can see some probability mass for EOS
+    tensor(29.9010)
+
+    >>> # If we disable `begin_suppress_tokens`, we can generate EOS in the first iteration.
+    >>> outputs = model.generate(
+    ...     **inputs, return_dict_in_generate=True, output_scores=True, begin_suppress_tokens=None
+    ... )
+    >>> print(outputs.scores[1][0, 50256])
+    tensor(11.2027)
+    ```
     """
 
     def __init__(self, begin_suppress_tokens, begin_index):
@@ -1432,8 +1666,33 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 
 class SuppressTokensLogitsProcessor(LogitsProcessor):
-    r"""This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so that they
-    are not sampled."""
+    r"""
+    This processor can be used to suppress a list of tokens. The processor will set their log probs to `-inf` so
+    that they are not generated. Originally created for
+    [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
+
+    >>> # Whisper has a long list of suppressed tokens. For instance, in this case, the token 1 is suppressed by default.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(outputs.scores[1][0, 1])  # 1 (and not 0) is the first freely generated token
+    tensor(-inf)
+
+    >>> # If we disable `suppress_tokens`, we can generate it.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, suppress_tokens=None)
+    >>> print(outputs.scores[1][0, 1])
+    tensor(5.7738)
+    ```
+    """
 
     def __init__(self, suppress_tokens):
         self.suppress_tokens = list(suppress_tokens)
@@ -1445,9 +1704,42 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 
 class ForceTokensLogitsProcessor(LogitsProcessor):
-    r"""This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
-    indices that will be forced before sampling. The processor will set their log probs to `inf` so that they are
-    sampled at their corresponding index."""
+    r"""
+    This processor takes a list of pairs of integers which indicates a mapping from generation indices to token
+    indices that will be forced before generation. The processor will set their log probs to `inf` so that they are
+    sampled at their corresponding index. Originally created for
+    [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
+
+    Examples:
+    ```python
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+    >>> from datasets import load_dataset
+
+    >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+    >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+    >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+    >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
+
+    >>> # This Whisper model forces the generation to start with `50362` at the first position by default, i.e.
+    >>> # `"forced_decoder_ids": [[1, 50362]]`. This means all other tokens are masked out.
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+    >>> print(
+    ...     all(outputs.scores[0][0, i] == float("-inf") for i in range(processor.tokenizer.vocab_size) if i != 50362)
+    ... )
+    True
+    >>> print(outputs.scores[0][0, 50362])
+    tensor(0.)
+
+    >>> # If we disable `forced_decoder_ids`, we stop seeing that effect
+    >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, forced_decoder_ids=None)
+    >>> print(
+    ...     all(outputs.scores[0][0, i] == float("-inf") for i in range(processor.tokenizer.vocab_size) if i != 50362)
+    ... )
+    False
+    >>> print(outputs.scores[0][0, 50362])
+    tensor(19.3140)
+    ```
+    """
 
     def __init__(self, force_token_map: List[List[int]]):
         self.force_token_map = dict(force_token_map)
@@ -1486,11 +1778,12 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
                 max_initial_timestamp_index (`int`, *optional*, defaults to 1):
                     Used to set the maximum value of the initial timestamp. This is used to prevent the model from
                     predicting timestamps that are too far in the future.
+        _detect_timestamp_from_logprob (`bool`, *optional*): Whether timestamps can be predicted from logprobs over all timestamps.
 
     Examples:
     ``` python
     >>> import torch
-    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration,GenerationConfig
+    >>> from transformers import AutoProcessor, WhisperForConditionalGeneration, GenerationConfig
     >>> from datasets import load_dataset
 
     >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
@@ -1501,7 +1794,7 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
 
     >>> #Displaying timestamps
     >>> generated_ids = model.generate(inputs=input_features, return_timestamps=True)
-    >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+    >>> transcription = processor.batch_decode(generated_ids, decode_with_timestamps=True)[0]
     >>> print("Transcription:", transcription)
     Transcription: <|startoftranscript|><|0.00|> He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can<|6.44|><|6.44|> discover in it but little of rocky Ithaca.<|9.44|><|endoftext|>
 
@@ -1516,29 +1809,35 @@ class WhisperTimeStampLogitsProcessor(LogitsProcessor):
     ```
     """
 
-    def __init__(self, generate_config):  # support for the kwargs
+    def __init__(
+        self, generate_config, _detect_timestamp_from_logprob: Optional[bool] = None
+    ):  # support for the kwargs
         self.eos_token_id = generate_config.eos_token_id
         self.no_timestamps_token_id = generate_config.no_timestamps_token_id
         self.timestamp_begin = generate_config.no_timestamps_token_id + 1
 
-        self.begin_index = len(generate_config.forced_decoder_ids) + 2
-        if generate_config.forced_decoder_ids[-1][1] == self.no_timestamps_token_id:
-            self.begin_index -= 1
-        self.max_initial_timestamp_index = generate_config.max_initial_timestamp_index
+        # this variable is mostly just used for testing
+        self._detect_timestamp_from_logprob = (
+            _detect_timestamp_from_logprob
+            if _detect_timestamp_from_logprob is not None
+            else getattr(generate_config, "_detect_timestamp_from_logprob", True)
+        )
+
+        self.begin_index = (
+            len(generate_config.forced_decoder_ids) + 1 if generate_config.forced_decoder_ids is not None else 1
+        )
+        self.max_initial_timestamp_index = getattr(generate_config, "max_initial_timestamp_index", None)
 
     @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         # suppress <|notimestamps|> which is handled by without_timestamps
         scores[:, self.no_timestamps_token_id] = -float("inf")
 
-        if input_ids.shape[1] == self.begin_index - 1:
-            scores[:, :] = -float("inf")
-            scores[:, self.timestamp_begin] = 0
-            return scores
-
         # timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly
         for k in range(input_ids.shape[0]):
-            seq = list(input_ids[k, self.begin_index :].tolist())
+            sampled_tokens = input_ids[k, self.begin_index :]
+            seq = list(sampled_tokens.tolist())
+
             last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.timestamp_begin
             penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.timestamp_begin
 
@@ -1548,8 +1847,23 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
                 else:  # cannot be normal text tokens
                     scores[k, : self.eos_token_id] = -float("inf")
 
-            # apply the `max_initial_timestamp` option
-            if input_ids.shape[1] == self.begin_index and self.max_initial_timestamp_index is not None:
+            timestamps = sampled_tokens[sampled_tokens.ge(self.timestamp_begin)]
+            if timestamps.numel() > 0:
+                # `timestamps` shouldn't decrease; forbid timestamp tokens smaller than the last
+                # The following lines of code are copied from: https://github.com/openai/whisper/pull/914/files#r1137085090
+                if last_was_timestamp and not penultimate_was_timestamp:
+                    timestamp_last = timestamps[-1]
+                else:
+                    # Avoid to emit <|0.00|> again
+                    timestamp_last = timestamps[-1] + 1
+
+                scores[k, self.timestamp_begin : timestamp_last] = -float("inf")
+
+        # apply the `max_initial_timestamp` option
+        if input_ids.shape[1] == self.begin_index:
+            scores[:, : self.timestamp_begin] = -float("inf")
+
+            if self.max_initial_timestamp_index is not None:
                 last_allowed = self.timestamp_begin + self.max_initial_timestamp_index
                 scores[:, last_allowed + 1 :] = -float("inf")
 
@@ -1558,25 +1872,49 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
         for k in range(input_ids.shape[0]):
             timestamp_logprob = logprobs[k, self.timestamp_begin :].logsumexp(dim=-1)
             max_text_token_logprob = logprobs[k, : self.timestamp_begin].max()
-            if timestamp_logprob > max_text_token_logprob:
+            if timestamp_logprob > max_text_token_logprob and self._detect_timestamp_from_logprob:
                 scores[k, : self.timestamp_begin] = -float("inf")
 
         return scores
 
 
 class ClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
-    r"""Logits processor for classifier free guidance (CFG). The scores are split over the batch dimension,
+    r"""
+    [`LogitsProcessor`] for classifier free guidance (CFG). The scores are split over the batch dimension,
     where the first half correspond to the conditional logits (predicted from the input prompt) and the second half
     correspond to the unconditional logits (predicted from an empty or 'null' prompt). The processor computes a
     weighted average across the conditional and unconditional logits, parameterised by the `guidance_scale`.
 
     See [the paper](https://arxiv.org/abs/2306.05284) for more information.
 
+    <Tip warning={true}>
+
+    This logits processor is exclusivelly compatible with
+    [MusicGen](https://huggingface.co/docs/transformers/main/en/model_doc/musicgen)
+
+    </Tip>
+
     Args:
         guidance_scale (float):
             The guidance scale for classifier free guidance (CFG). CFG is enabled by setting `guidance_scale > 1`.
             Higher guidance scale encourages the model to generate samples that are more closely linked to the input
             prompt, usually at the expense of poorer quality.
+
+    Examples:
+
+    ```python
+    >>> from transformers import AutoProcessor, MusicgenForConditionalGeneration
+
+    >>> processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
+    >>> model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
+
+    >>> inputs = processor(
+    ...     text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"],
+    ...     padding=True,
+    ...     return_tensors="pt",
+    ... )
+    >>> audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)
+    ```
     """
 
     def __init__(self, guidance_scale):
@@ -1606,7 +1944,15 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 class AlternatingCodebooksLogitsProcessor(LogitsProcessor):
     r"""
-    [`LogitsProcessor`] enforcing alternated generation between the two codebooks of [`Bark`]'s fine submodel.
+    [`LogitsProcessor`] enforcing alternated generation between the two codebooks of Bark.
+
+    <Tip warning={true}>
+
+    This logits processor is exclusivelly compatible with
+    [Bark](https://huggingface.co/docs/transformers/en/model_doc/bark)'s fine submodel. See the model documentation
+    for examples.
+
+    </Tip>
 
     Args:
         input_start_len (`int`):
@@ -1641,10 +1987,10 @@ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> to
 
 
 class UnbatchedClassifierFreeGuidanceLogitsProcessor(LogitsProcessor):
-    r"""Logits processor for Classifier-Free Guidance (CFG). The processors
-    computes a weighted average across scores from prompt conditional and prompt unconditional (or negative) logits,
-    parameterized by the `guidance_scale`. The unconditional scores are computed internally by prompting `model` with
-    the `unconditional_ids` branch.
+    r"""
+    Logits processor for Classifier-Free Guidance (CFG). The processors computes a weighted average across scores
+    from prompt conditional and prompt unconditional (or negative) logits, parameterized by the `guidance_scale`.
+    The unconditional scores are computed internally by prompting `model` with the `unconditional_ids` branch.
 
     See [the paper](https://arxiv.org/abs/2306.17806) for more information.
 
@@ -1761,6 +2107,13 @@ def __call__(self, input_ids, scores):
 class BarkEosPrioritizerLogitsProcessor(LogitsProcessor):
     r"""This processor ensures that the EOS token is selected if its probability is greater than the `min_eos_p`.
 
+    <Tip warning={true}>
+
+    This logits processor is exclusivelly compatible with
+    [Bark](https://huggingface.co/docs/transformers/en/model_doc/bark). See the model documentation for examples.
+
+    </Tip>
+
     Args:
         eos_token_id (`Union[int, List[int]]`):
             The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
diff --git a/src/transformers/generation/tf_utils.py b/src/transformers/generation/tf_utils.py
index 59848c3c85905d..325e7e1cba2720 100644
--- a/src/transformers/generation/tf_utils.py
+++ b/src/transformers/generation/tf_utils.py
@@ -2268,6 +2268,8 @@ def unflatten_beam_dim(tensor, num_beams, batch_axis=0):
 
         # 3. init tensors to use for "xla-compileable" generate function
         batch_size, num_beams, cur_len = shape_list(input_ids)
+        # store the prompt length of decoder
+        decoder_prompt_len = cur_len
 
         # per batch, beam-item holding current token in loop, pre-populated with `pad_token_id`
         input_ids_padding = tf.ones((batch_size, num_beams, max_length - cur_len), dtype=tf.int32) * (
@@ -2286,8 +2288,8 @@ def unflatten_beam_dim(tensor, num_beams, batch_axis=0):
         scores = tf.ones((batch_size, num_beams)) * -1.0e9
 
         # per batch beam indices
-        running_beam_indices = tf.ones((batch_size, num_beams, max_length), dtype=tf.int32) * -1
-        beam_indices = tf.ones((batch_size, num_beams, max_length), dtype=tf.int32) * -1
+        running_beam_indices = tf.ones((batch_size, num_beams, max_length - decoder_prompt_len), dtype=tf.int32) * -1
+        beam_indices = tf.ones((batch_size, num_beams, max_length - decoder_prompt_len), dtype=tf.int32) * -1
 
         # flatten beam dim
         if "encoder_outputs" in model_kwargs:
@@ -2308,6 +2310,7 @@ def beam_search_cond_fn(
             scores,
             beam_indices,
             is_sent_finished,
+            decoder_prompt_len,
             model_kwargs,
         ):
             """
@@ -2318,15 +2321,17 @@ def beam_search_cond_fn(
             not_max_length_yet = cur_len < max_length
 
             # 2. can the new beams still improve?
-            # early_stopping == False -> apply heuristic = always get the best score from `cur_len`. See the discussion
+            # early_stopping == False -> apply heuristic = always get the best score from `cur_len - decoder_prompt_len`. See the discussion
             # below for more details.
             # https://github.com/huggingface/transformers/pull/20901#issuecomment-1369845565
             # early_stopping == "never" -> compute the best score from max_length or cur_len, depending on the sign of
             #   length_penalty. Positive length_penalty favors longer sequences, thus we use max_length there.
             if early_stopping == "never" and length_penalty > 0.0:
-                best_running_score = running_scores[:, :1] / (max_length**length_penalty)
+                best_running_score = running_scores[:, :1] / ((max_length - decoder_prompt_len) ** length_penalty)
             else:
-                best_running_score = running_scores[:, :1] / (tf.cast(cur_len, dtype=tf.float32) ** length_penalty)
+                best_running_score = running_scores[:, :1] / (
+                    tf.cast(cur_len - decoder_prompt_len, dtype=tf.float32) ** length_penalty
+                )
             worst_finished_score = tf.where(
                 is_sent_finished, tf.math.reduce_min(scores, axis=1, keepdims=True), -1.0e9
             )
@@ -2346,6 +2351,7 @@ def beam_search_body_fn(
             scores,
             beam_indices,
             is_sent_finished,
+            decoder_prompt_len,
             model_kwargs,
         ):
             """
@@ -2387,7 +2393,9 @@ def beam_search_body_fn(
                 if output_scores:
                     all_scores.append(
                         logits_warper(
-                            flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs_processed), cur_len
+                            flatten_beam_dim(running_sequences),
+                            flatten_beam_dim(log_probs_processed),
+                            cur_len,
                         )
                     )
                 if output_attentions and self.config.is_encoder_decoder:
@@ -2439,6 +2447,14 @@ def beam_search_body_fn(
             batch_modified_indices = topk_current_beam_indices + tf.broadcast_to(
                 tf.expand_dims(tf.range(batch_size) * num_beams, axis=1), topk_current_beam_indices.shape
             )
+            update_indices = tf.stack(
+                [
+                    indices_batch,
+                    indices_beam,
+                    tf.broadcast_to(cur_len - decoder_prompt_len, [batch_size * beams_to_keep]),
+                ],
+                axis=-1,
+            )
             topk_beam_indices = tf.tensor_scatter_nd_update(
                 tensor=topk_running_beam_indices,
                 indices=update_indices,
@@ -2455,7 +2471,8 @@ def beam_search_body_fn(
                 eos_in_next_token = tf.math.reduce_any(
                     tf.equal(
                         tf.broadcast_to(
-                            topk_sequences[:, :, cur_len], [len(eos_token_id)] + topk_sequences[:, :, cur_len].shape
+                            topk_sequences[:, :, cur_len],
+                            [len(eos_token_id)] + topk_sequences[:, :, cur_len].shape,
                         ),
                         tf.expand_dims(tf.expand_dims(eos_token_id, -1), -1),
                     ),
@@ -2483,7 +2500,9 @@ def beam_search_body_fn(
             # - add length penalty
             # - make sure no scores can be added anymore if beam is full
             # - make sure still running sequences cannot be chosen as finalized beam
-            topk_log_probs = topk_log_probs / (tf.cast(cur_len, dtype=tf.float32) ** length_penalty)
+            topk_log_probs = topk_log_probs / (
+                tf.cast(cur_len + 1 - decoder_prompt_len, dtype=tf.float32) ** length_penalty
+            )
             beams_in_batch_are_full = tf.broadcast_to(
                 tf.math.reduce_all(is_sent_finished, axis=-1, keepdims=True), shape_list(did_topk_just_finished)
             ) & (early_stopping is True)
@@ -2546,6 +2565,7 @@ def beam_search_body_fn(
                 next_scores,
                 next_beam_indices,
                 next_is_sent_finished,
+                decoder_prompt_len,
                 next_model_kwargs,
             )
 
@@ -2560,6 +2580,7 @@ def beam_search_body_fn(
             scores,
             beam_indices,
             is_sent_finished,
+            decoder_prompt_len,
             model_kwargs,
         ) = beam_search_body_fn(
             cur_len,
@@ -2570,6 +2591,7 @@ def beam_search_body_fn(
             scores,
             beam_indices,
             is_sent_finished,
+            decoder_prompt_len,
             model_kwargs,
         )
 
@@ -2585,6 +2607,7 @@ def beam_search_body_fn(
             scores,
             beam_indices,
             is_sent_finished,
+            decoder_prompt_len,
             _,
         ) = tf.while_loop(
             beam_search_cond_fn,
@@ -2598,6 +2621,7 @@ def beam_search_body_fn(
                 scores,
                 beam_indices,
                 is_sent_finished,
+                decoder_prompt_len,
                 model_kwargs,
             ),
             maximum_iterations=maximum_iterations,
@@ -2611,7 +2635,7 @@ def beam_search_body_fn(
         beam_indices = tf.where(none_finished[:, None, None], beam_indices, running_beam_indices)
 
         # Apply the length penalty so that running scores match the finalized scores if they are used
-        running_scores = running_scores / (tf.cast(cur_len, dtype=tf.float32) ** length_penalty)
+        running_scores = running_scores / (tf.cast(cur_len - decoder_prompt_len, dtype=tf.float32) ** length_penalty)
         scores = tf.where(none_finished[:, None], scores, running_scores)
 
         # Take best beams for each batch (the score is sorted in descending order)
@@ -2622,7 +2646,7 @@ def beam_search_body_fn(
         if not use_xla:
             # Cut for backward compatibility
             sequences = sequences[:, :cur_len]
-            beam_indices = beam_indices[:, :cur_len]
+            beam_indices = beam_indices[:, : cur_len - decoder_prompt_len]
 
         if return_dict_in_generate:
             if self.config.is_encoder_decoder:
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index 14e4b101291133..d7510951b116b1 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -24,6 +24,7 @@
 import torch.distributed as dist
 from torch import nn
 
+from ..cache_utils import Cache, DynamicCache
 from ..integrations.deepspeed import is_deepspeed_zero3_enabled
 from ..modeling_outputs import CausalLMOutputWithPast, Seq2SeqLMOutput
 from ..models.auto import (
@@ -36,6 +37,13 @@
 from ..utils import ExplicitEnum, ModelOutput, is_accelerate_available, logging
 from .beam_constraints import DisjunctiveConstraint, PhrasalConstraint
 from .beam_search import BeamScorer, BeamSearchScorer, ConstrainedBeamSearchScorer
+from .candidate_generator import (
+    AssistedCandidateGenerator,
+    CandidateGenerator,
+    _crop_past_key_values,
+    _prepare_attention_mask,
+    _prepare_token_type_ids,
+)
 from .configuration_utils import GenerationConfig
 from .logits_process import (
     EncoderNoRepeatNGramLogitsProcessor,
@@ -888,6 +896,28 @@ def _reorder_cache(self, past_key_values, beam_idx):
             f" enable beam search for {self.__class__}"
         )
 
+    def _get_candidate_generator(
+        self,
+        generation_config: GenerationConfig,
+        input_ids: torch.LongTensor,
+        inputs_tensor: torch.Tensor,
+        assistant_model: "PreTrainedModel",
+        logits_processor: LogitsProcessorList,
+        model_kwargs: Dict,
+    ) -> CandidateGenerator:
+        """
+        Returns the candidate generator to be used in `assisted_generation`
+        """
+        candidate_generator = AssistedCandidateGenerator(
+            input_ids=input_ids,
+            assistant_model=assistant_model,
+            logits_processor=logits_processor,
+            model_kwargs=model_kwargs,
+            inputs_tensor=inputs_tensor,
+            eos_token_id=generation_config.eos_token_id,
+        )
+        return candidate_generator
+
     def _get_logits_warper(
         self,
         generation_config: GenerationConfig,
@@ -1031,16 +1061,9 @@ def _get_logits_processor(
             generation_config.encoder_no_repeat_ngram_size is not None
             and generation_config.encoder_no_repeat_ngram_size > 0
         ):
-            if self.config.is_encoder_decoder:
-                processors.append(
-                    EncoderNoRepeatNGramLogitsProcessor(
-                        generation_config.encoder_no_repeat_ngram_size, encoder_input_ids
-                    )
-                )
-            else:
-                raise ValueError(
-                    "It's impossible to use `encoder_no_repeat_ngram_size` with decoder-only architecture"
-                )
+            processors.append(
+                EncoderNoRepeatNGramLogitsProcessor(generation_config.encoder_no_repeat_ngram_size, encoder_input_ids)
+            )
         if generation_config.bad_words_ids is not None:
             processors.append(
                 NoBadWordsLogitsProcessor(generation_config.bad_words_ids, generation_config.eos_token_id)
@@ -1217,9 +1240,10 @@ def compute_transition_scores(
         ...     outputs.sequences, outputs.scores, outputs.beam_indices, normalize_logits=False
         ... )
         >>> # If you sum the generated tokens' scores and apply the length penalty, you'll get the sequence scores.
-        >>> # Tip: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
+        >>> # Tip 1: recomputing the scores is only guaranteed to match with `normalize_logits=False`. Depending on the
         >>> # use case, you might want to recompute it with `normalize_logits=True`.
-        >>> output_length = input_length + np.sum(transition_scores.numpy() < 0, axis=1)
+        >>> # Tip 2: the output length does NOT include the input length
+        >>> output_length = np.sum(transition_scores.numpy() < 0, axis=1)
         >>> length_penalty = model.generation_config.length_penalty
         >>> reconstructed_scores = transition_scores.sum(axis=1) / (output_length**length_penalty)
         >>> print(np.allclose(outputs.sequences_scores, reconstructed_scores))
@@ -1293,6 +1317,13 @@ def _validate_model_class(self):
 
     def _validate_model_kwargs(self, model_kwargs: Dict[str, Any]):
         """Validates model kwargs for generation. Generate argument typos will also be caught here."""
+        # If a `Cache` instance is passed, checks whether the model is compatible with it
+        if isinstance(model_kwargs.get("past_key_values", None), Cache) and not self._supports_cache_class:
+            raise ValueError(
+                f"{self.__class__.__name__} does not support an instance of `Cache` as `past_key_values`. Please "
+                "check the model documentation for supported cache formats."
+            )
+
         # Excludes arguments that are handled before calling any model function
         if self.config.is_encoder_decoder:
             for key in ["decoder_input_ids"]:
@@ -1390,43 +1421,6 @@ def _validate_generated_length(self, generation_config, input_ids_length, has_de
                     UserWarning,
                 )
 
-    def _extend_attention_mask(self, model_kwargs: Dict[str, Any], new_mask_length: int) -> Dict[str, Any]:
-        if self.config.is_encoder_decoder:
-            key = "decoder_attention_mask"
-        else:
-            key = "attention_mask"
-
-        if key not in model_kwargs:
-            return model_kwargs
-
-        mask = model_kwargs[key]
-        mask_extension_length = new_mask_length - mask.shape[1]
-
-        if mask_extension_length < 0:
-            raise ValueError("Cannot extend attention mask to a length less than it already is")
-
-        model_kwargs[key] = torch.cat(
-            [mask, mask.new_ones((mask.shape[0], mask_extension_length))],
-            dim=-1,
-        )
-
-        return model_kwargs
-
-    def _extend_token_type_ids(self, model_kwargs: Dict[str, Any], new_length: int) -> Dict[str, Any]:
-        if "token_type_ids" not in model_kwargs or model_kwargs["token_type_ids"] is None:
-            return model_kwargs
-
-        token_type_ids = model_kwargs["token_type_ids"]
-        final_token_type = token_type_ids[:, -1].unsqueeze(-1)
-        extension_length = new_length - token_type_ids.shape[1]
-        token_type_copies = final_token_type.repeat(1, extension_length)
-        model_kwargs["token_type_ids"] = torch.cat(
-            [model_kwargs["token_type_ids"], token_type_copies],
-            dim=-1,
-        )
-
-        return model_kwargs
-
     @torch.no_grad()
     def generate(
         self,
@@ -1706,36 +1700,20 @@ def generate(
             if not model_kwargs["use_cache"]:
                 raise ValueError("assisted generate requires `use_cache=True`")
 
-            assistant_accepts_encoder_outputs = "encoder_outputs" in set(
-                inspect.signature(assistant_model.forward).parameters.keys()
+            # 11. Get the candidate generator, given the parameterization
+            candidate_generator = self._get_candidate_generator(
+                generation_config=generation_config,
+                input_ids=input_ids,
+                inputs_tensor=inputs_tensor,
+                assistant_model=assistant_model,
+                logits_processor=logits_processor,
+                model_kwargs=model_kwargs,
             )
 
-            # 11. If the assistant model is an encoder-decoder, prepare its encoder outputs
-            if assistant_model.config.is_encoder_decoder and "assistant_encoder_outputs" not in model_kwargs:
-                assistant_model_kwargs = copy.deepcopy(model_kwargs)
-                inputs_tensor, model_input_name, assistant_model_kwargs = assistant_model._prepare_model_inputs(
-                    inputs_tensor, assistant_model.generation_config.bos_token_id, assistant_model_kwargs
-                )
-                assistant_model_kwargs = assistant_model._prepare_encoder_decoder_kwargs_for_generation(
-                    inputs_tensor, assistant_model_kwargs, model_input_name
-                )
-                model_kwargs["assistant_encoder_outputs"] = assistant_model_kwargs["encoder_outputs"]
-
-            if (
-                not assistant_model.config.is_encoder_decoder
-                and assistant_accepts_encoder_outputs
-                and "encoder_outputs" in model_kwargs
-            ):
-                # some assistants might be assymetric (many more enc layers than dec layers)
-                # encoder-decoder models that share the exact same encoder as the teacher
-                # in this case the assistant only needs to load the light-weight decoder,
-                # but still requires `encoder_outputs` to be passed
-                model_kwargs["assistant_encoder_outputs"] = model_kwargs["encoder_outputs"]
-
             # 12. run assisted generate
             return self.assisted_decoding(
                 input_ids,
-                assistant_model=assistant_model,
+                candidate_generator=candidate_generator,
                 do_sample=generation_config.do_sample,
                 logits_processor=logits_processor,
                 logits_warper=self._get_logits_warper(generation_config) if generation_config.do_sample else None,
@@ -2837,7 +2815,7 @@ def sample(
         if max_length is not None:
             warnings.warn(
                 "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                 UserWarning,
             )
             stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
@@ -2988,6 +2966,32 @@ def sample(
         else:
             return input_ids
 
+    def _temporary_reorder_cache(self, past_key_values, beam_idx):
+        """
+        Temporary function to handle the different types of cache reordering processes while we roll out `Cache`.
+
+        TODO: standardize cache formats and make all models compatible with `Cache`. It would remove the need
+        for this function, with `Cache.reorder_cache` being the sole remaining code path
+        """
+        model_class = self.__class__.__name__.lower()
+        # Exception 1: code path for models using the legacy cache format
+        if isinstance(past_key_values, (tuple, list)):
+            past_key_values = self._reorder_cache(past_key_values, beam_idx)
+        # Exception 2: models with different cache formats. These are limited to `DynamicCache` until their
+        # cache format is standardized, to avoid adding complexity to the codebase.
+        elif "bloom" in model_class or "gptbigcode" in model_class:
+            if not isinstance(past_key_values, DynamicCache):
+                raise ValueError(
+                    f"Using an unsupported cache format with {model_class}. Currently, it only supports the "
+                    "legacy tuple format or `DynamicCache`"
+                )
+            past_key_values = self._reorder_cache(past_key_values, beam_idx)
+            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+        # Standard code path: use the `Cache.reorder_cache`
+        else:
+            past_key_values.reorder_cache(beam_idx)
+        return past_key_values
+
     def beam_search(
         self,
         input_ids: torch.LongTensor,
@@ -3116,7 +3120,7 @@ def beam_search(
         if max_length is not None:
             warnings.warn(
                 "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                 UserWarning,
             )
             stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
@@ -3261,7 +3265,9 @@ def beam_search(
                 outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
             )
             if model_kwargs["past_key_values"] is not None:
-                model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx)
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
+                    model_kwargs["past_key_values"], beam_idx
+                )
 
             if return_dict_in_generate and output_scores:
                 beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
@@ -3458,7 +3464,7 @@ def beam_sample(
         if max_length is not None:
             warnings.warn(
                 "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                 UserWarning,
             )
             stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
@@ -3596,7 +3602,9 @@ def beam_sample(
                 outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
             )
             if model_kwargs["past_key_values"] is not None:
-                model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx)
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
+                    model_kwargs["past_key_values"], beam_idx
+                )
 
             if return_dict_in_generate and output_scores:
                 beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
@@ -3786,7 +3794,7 @@ def group_beam_search(
         if max_length is not None:
             warnings.warn(
                 "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                 UserWarning,
             )
             stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
@@ -3981,7 +3989,7 @@ def group_beam_search(
                 outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
             )
             if model_kwargs["past_key_values"] is not None:
-                model_kwargs["past_key_values"] = self._reorder_cache(
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
                     model_kwargs["past_key_values"], reordering_indices
                 )
 
@@ -4176,7 +4184,7 @@ def constrained_beam_search(
         if max_length is not None:
             warnings.warn(
                 "`max_length` is deprecated in this function, use"
-                " `stopping_criteria=StoppingCriteriaList(MaxLengthCriteria(max_length=max_length))` instead.",
+                " `stopping_criteria=StoppingCriteriaList([MaxLengthCriteria(max_length=max_length)])` instead.",
                 UserWarning,
             )
             stopping_criteria = validate_stopping_criteria(stopping_criteria, max_length)
@@ -4323,7 +4331,9 @@ def constrained_beam_search(
                 outputs, model_kwargs, is_encoder_decoder=self.config.is_encoder_decoder
             )
             if model_kwargs["past_key_values"] is not None:
-                model_kwargs["past_key_values"] = self._reorder_cache(model_kwargs["past_key_values"], beam_idx)
+                model_kwargs["past_key_values"] = self._temporary_reorder_cache(
+                    model_kwargs["past_key_values"], beam_idx
+                )
 
             if return_dict_in_generate and output_scores:
                 beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
@@ -4381,7 +4391,8 @@ def constrained_beam_search(
     def assisted_decoding(
         self,
         input_ids: torch.LongTensor,
-        assistant_model: "PreTrainedModel",
+        assistant_model: Optional["PreTrainedModel"] = None,
+        candidate_generator: Optional["CandidateGenerator"] = None,
         do_sample: bool = False,
         logits_processor: Optional[LogitsProcessorList] = None,
         logits_warper: Optional[LogitsProcessorList] = None,
@@ -4398,12 +4409,13 @@ def assisted_decoding(
     ):
         r"""
         Generates sequences of token ids for models with a language modeling head using **greedy decoding** or
-        **sample** (depending on `do_sample`), assisted by a smaller model. Can be used for text-decoder, text-to-text,
-        speech-to-text, and vision-to-text models.
+        **sample** (depending on `do_sample`), assisted by candidate sequences. Assisted generation is an example of a
+        candidate decoding strategy. Can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text
+        models.
 
         <Tip warning={true}>
 
-        In most cases, you do not need to call [`~generation.GenerationMixin.assisted_decoding`] directly. Use
+        In most cases, you do not need to call [`~generation.GenerationMixin.candidate_decoding`] directly. Use
         generate() instead. For an overview of generation strategies and code examples, check the [following
         guide](../generation_strategies).
 
@@ -4412,6 +4424,9 @@ def assisted_decoding(
         Parameters:
             input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                 The sequence used as a prompt for the generation.
+            candidate_generator (`CandidateGenerator`, *optional*):
+                A derived instance of [`CandidateGenerator`] that defines how candidate sequences are generated. For
+                more information, the documentation of [`CandidateGenerator`] should be read. Only one of `assistant_model` or `candidate_generator` should be passed as input to this function.
             assistant_model (`PreTrainedModel`, *optional*):
                 An assistant model that can be used to accelerate generation. The assistant model must have the exact
                 same tokenizer. The acceleration is achieved when forecasting candidate tokens with the assistent model
@@ -4494,20 +4509,23 @@ def assisted_decoding(
         >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
         ["It might be possible to get a better understanding of the nature of the problem, but it's not"]
         ```"""
-        # Assistant: initialize assistant-related variables
-        if hasattr(assistant_model, "num_assistant_tokens"):
+        # handling deprecated arguments
+        if (assistant_model is None) == (candidate_generator is None):
+            raise ValueError("One (and only one) of `assistant_model` and `candidate_generator` should be defined.")
+
+        if assistant_model is not None:
+            candidate_generator = AssistedCandidateGenerator(
+                input_ids=input_ids,
+                assistant_model=assistant_model,
+                logits_processor=logits_processor,
+                model_kwargs=model_kwargs,
+                eos_token_id=eos_token_id,
+            )
             warnings.warn(
-                "Setting `num_assistant_tokens` via `assistant_model.num_assistant_tokens` is deprecated and will be removed in v.37. Make sure to set `num_assistant_tokens` via the generation_config instead.",
+                "Passing `assistant_model` to `assisted_decoding` is deprecated and will be removed in v4.38. "
+                "Pass the `candidate_generator` argument instead.",
                 FutureWarning,
             )
-            num_assistant_tokens = assistant_model.num_assistant_tokens
-        else:
-            num_assistant_tokens = assistant_model.generation_config.num_assistant_tokens
-
-        # check if assistant model accepts encoder_outputs
-        assistant_accepts_encoder_outputs = "encoder_outputs" in set(
-            inspect.signature(assistant_model.forward).parameters.keys()
-        )
 
         # init values
         logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
@@ -4551,15 +4569,6 @@ def assisted_decoding(
 
         # other auxiliary variables
         max_len = stopping_criteria[0].max_length
-        assistant_kv_indexing = (
-            1
-            if "bloom" in assistant_model.__class__.__name__.lower()
-            or (
-                assistant_model.config.architectures is not None
-                and "bloom" in assistant_model.config.architectures[0].lower()
-            )
-            else 0
-        )
 
         this_peer_finished = False  # used by synced_gpus only
         while True:
@@ -4573,71 +4582,18 @@ def assisted_decoding(
                 if this_peer_finished_flag.item() == 0.0:
                     break
 
-            # Assistant: main logic start
             cur_len = input_ids.shape[-1]
 
-            #  1. Forecast next N tokens using the assistant model. This `for` block can be replaced with a
-            # `.generate()` call if we decide to add `past_key_values` as a possible output of generate, as we
-            # need access to the assistant cache to secure strong speedups.
-            candidate_input_ids = input_ids
-            for _ in range(int(num_assistant_tokens)):
-                # 1.1. use the assistant model to obtain the next candidate logits
-                if "assistant_past_key_values" in model_kwargs:
-                    prev_seq_len = model_kwargs["assistant_past_key_values"][0][assistant_kv_indexing].shape[-2]
-                    # `new_token_len` can be 1 or 2 (next token in assistant + last token picked by the larger model)
-                    new_token_len = candidate_input_ids.shape[1] - prev_seq_len
-                    assist_inputs = candidate_input_ids[:, -new_token_len:]
-                    # TODO (joao): make it compatible with models that use unconventional fwd pass logic, like blip2
-                    if assistant_model.config.is_encoder_decoder:
-                        assistant_model_outputs = assistant_model(
-                            decoder_input_ids=assist_inputs,
-                            past_key_values=model_kwargs["assistant_past_key_values"],
-                            encoder_outputs=model_kwargs["assistant_encoder_outputs"],
-                        )
-                    else:
-                        encoder_kwargs = {}
-
-                        if assistant_accepts_encoder_outputs and "assistant_encoder_outputs" in model_kwargs:
-                            encoder_kwargs["encoder_outputs"] = model_kwargs["assistant_encoder_outputs"]
-
-                        assistant_model_outputs = assistant_model(
-                            assist_inputs, past_key_values=model_kwargs["assistant_past_key_values"], **encoder_kwargs
-                        )
-                else:
-                    if assistant_model.config.is_encoder_decoder:
-                        assistant_model_outputs = assistant_model(
-                            decoder_input_ids=candidate_input_ids,
-                            encoder_outputs=model_kwargs["assistant_encoder_outputs"],
-                        )
-                    else:
-                        encoder_kwargs = {}
-
-                        if assistant_accepts_encoder_outputs and "assistant_encoder_outputs" in model_kwargs:
-                            encoder_kwargs["encoder_outputs"] = model_kwargs["assistant_encoder_outputs"]
-
-                        assistant_model_outputs = assistant_model(candidate_input_ids, **encoder_kwargs)
-
-                # 1.2. greedily select the next candidate token
-                model_kwargs["assistant_past_key_values"] = assistant_model_outputs.past_key_values
-                if len(logits_processor) > 0:
-                    assistant_model_outputs.logits[:, -1, :] = logits_processor(
-                        candidate_input_ids, assistant_model_outputs.logits[:, -1, :]
-                    )
-                new_token = assistant_model_outputs.logits[:, -1, :].argmax(dim=-1)
-                candidate_input_ids = torch.cat((candidate_input_ids, new_token[:, None]), dim=-1)
-
-                # 1.3. stop assistant generation on EOS
-                if eos_token_id_tensor is not None:
-                    last_assistant_token_is_eos = new_token.tile(eos_token_id_tensor.shape[0], 1)
-                    last_assistant_token_is_eos = (
-                        ~last_assistant_token_is_eos.ne(eos_token_id_tensor.unsqueeze(1)).prod(dim=0).bool()
-                    )
-                    if last_assistant_token_is_eos:
-                        break
-                else:
-                    last_assistant_token_is_eos = False
-
+            #  1. Fetch candidate sequences from a `CandidateGenerator`
+            candidate_input_ids = candidate_generator.get_candidates(input_ids)
             candidate_length = candidate_input_ids.shape[1] - input_ids.shape[1]
+            last_assistant_token_is_eos = (
+                ~candidate_input_ids[:, -1]
+                .tile(eos_token_id_tensor.shape[0], 1)
+                .ne(eos_token_id_tensor.unsqueeze(1))
+                .prod(dim=0)
+                .bool()
+            )
 
             # 2. Use the original model to obtain the next token logits given the candidate sequence. We obtain
             # `candidate_length + 1` relevant logits from this process: in the event that all candidates are correct,
@@ -4645,8 +4601,10 @@ def assisted_decoding(
 
             # 2.1. Prepare the model inputs
             candidate_kwargs = copy.copy(model_kwargs)
-            candidate_kwargs = self._extend_attention_mask(candidate_kwargs, candidate_input_ids.shape[1])
-            candidate_kwargs = self._extend_token_type_ids(candidate_kwargs, candidate_input_ids.shape[1])
+            candidate_kwargs = _prepare_attention_mask(
+                candidate_kwargs, candidate_input_ids.shape[1], self.config.is_encoder_decoder
+            )
+            candidate_kwargs = _prepare_token_type_ids(candidate_kwargs, candidate_input_ids.shape[1])
 
             model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **candidate_kwargs)
 
@@ -4698,20 +4656,10 @@ def assisted_decoding(
             # 5.3. Discard past key values relative to unused assistant tokens
             new_cache_size = new_cur_len - 1
             outputs.past_key_values = _crop_past_key_values(self, outputs.past_key_values, new_cache_size)
-            model_kwargs["assistant_past_key_values"] = _crop_past_key_values(
-                assistant_model, model_kwargs["assistant_past_key_values"], new_cache_size - 1
-            )  # the assistant does not have the token after the last match, hence the -1
-
-            # 6. Adjust the max number of assistant tokens to use in the next iteration. This is a simple heuristic,
-            # probably can be improved -- we want to balance the benefits of getting assistant tokens correct with the
-            # cost of forecasting incorrect assistant tokens.
-            if assistant_model.generation_config.num_assistant_tokens_schedule == "heuristic":
-                if n_matches == int(num_assistant_tokens):
-                    num_assistant_tokens += 2.0
-                else:
-                    num_assistant_tokens = max(1.0, num_assistant_tokens - 1.0)
 
-            # Assistant: main logic end
+            # 6. Update the candidate generation strategy if needed
+            candidate_generator.update_candidate_strategy(input_ids, new_logits, n_matches)
+
             if synced_gpus and this_peer_finished:
                 continue  # don't waste resources running the code we don't need
 
@@ -4807,54 +4755,6 @@ def assisted_decoding(
             return input_ids
 
 
-def _crop_past_key_values(model, past_key_values, maximum_length):
-    """Crops the past key values up to a certain maximum length."""
-    new_past = []
-    if model.config.is_encoder_decoder:
-        for idx in range(len(past_key_values)):
-            new_past.append(
-                (
-                    past_key_values[idx][0][:, :, :maximum_length, :],
-                    past_key_values[idx][1][:, :, :maximum_length, :],
-                    past_key_values[idx][2],
-                    past_key_values[idx][3],
-                )
-            )
-        past_key_values = tuple(new_past)
-    # bloom is special
-    elif "bloom" in model.__class__.__name__.lower() or (
-        model.config.architectures is not None and "bloom" in model.config.architectures[0].lower()
-    ):
-        for idx in range(len(past_key_values)):
-            new_past.append(
-                (
-                    past_key_values[idx][0][:, :, :maximum_length],
-                    past_key_values[idx][1][:, :maximum_length, :],
-                )
-            )
-        past_key_values = tuple(new_past)
-    # gptbigcode is too
-    elif "gptbigcode" in model.__class__.__name__.lower() or (
-        model.config.architectures is not None and "gptbigcode" in model.config.architectures[0].lower()
-    ):
-        if model.config.multi_query:
-            for idx in range(len(past_key_values)):
-                past_key_values[idx] = past_key_values[idx][:, :maximum_length, :]
-        else:
-            for idx in range(len(past_key_values)):
-                past_key_values[idx] = past_key_values[idx][:, :, :maximum_length, :]
-    else:
-        for idx in range(len(past_key_values)):
-            new_past.append(
-                (
-                    past_key_values[idx][0][:, :, :maximum_length, :],
-                    past_key_values[idx][1][:, :, :maximum_length, :],
-                )
-            )
-        past_key_values = tuple(new_past)
-    return past_key_values
-
-
 def _split_model_outputs(outputs, new_outputs, cur_len, added_len, is_decoder_attention=False):
     """
     Given the (decoder/cross attentions)/(decoder hidden states) for multiple generated tokens, splits it into a tuple
diff --git a/src/transformers/hyperparameter_search.py b/src/transformers/hyperparameter_search.py
index 8dfd60cc39cd31..c14165165ca1f9 100644
--- a/src/transformers/hyperparameter_search.py
+++ b/src/transformers/hyperparameter_search.py
@@ -15,7 +15,7 @@
 
 from .integrations import (
     is_optuna_available,
-    is_ray_available,
+    is_ray_tune_available,
     is_sigopt_available,
     is_wandb_available,
     run_hp_search_optuna,
@@ -81,7 +81,7 @@ class RayTuneBackend(HyperParamSearchBackendBase):
 
     @staticmethod
     def is_available():
-        return is_ray_available()
+        return is_ray_tune_available()
 
     def run(self, trainer, n_trials: int, direction: str, **kwargs):
         return run_hp_search_ray(trainer, n_trials, direction, **kwargs)
diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py
index bfb88d03d3fc4d..5d280bf5e2b49a 100644
--- a/src/transformers/image_utils.py
+++ b/src/transformers/image_utils.py
@@ -15,6 +15,7 @@
 
 import base64
 import os
+from enum import EnumMeta
 from io import BytesIO
 from typing import TYPE_CHECKING, Dict, Iterable, List, Optional, Tuple, Union
 
@@ -29,6 +30,7 @@
     is_torch_available,
     is_torch_tensor,
     is_vision_available,
+    logging,
     requires_backends,
     to_numpy,
 )
@@ -56,6 +58,9 @@
         import torch
 
 
+logger = logging.get_logger(__name__)
+
+
 ImageInput = Union[
     "PIL.Image.Image", np.ndarray, "torch.Tensor", List["PIL.Image.Image"], List[np.ndarray], List["torch.Tensor"]
 ]  # noqa
@@ -66,6 +71,28 @@ class ChannelDimension(ExplicitEnum):
     LAST = "channels_last"
 
 
+class AnnotationFormat(ExplicitEnum):
+    COCO_DETECTION = "coco_detection"
+    COCO_PANOPTIC = "coco_panoptic"
+
+
+class DeprecatedEnumMeta(EnumMeta):
+    def __init__(cls, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        logger.warning_once(
+            f"`{cls.__name__}` is deprecated and will be removed in v4.38. "
+            f"Please use `transformers.image_utils.AnnotationFormat` instead."
+        )
+
+
+class AnnotionFormat(ExplicitEnum, metaclass=DeprecatedEnumMeta):
+    COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
+    COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
+
+
+AnnotationType = Dict[str, Union[int, str, List[Dict]]]
+
+
 def is_pil_image(img):
     return is_vision_available() and isinstance(img, PIL.Image.Image)
 
@@ -664,3 +691,33 @@ def rotate(self, image, angle, resample=None, expand=0, center=None, translate=N
         return image.rotate(
             angle, resample=resample, expand=expand, center=center, translate=translate, fillcolor=fillcolor
         )
+
+
+def promote_annotation_format(annotation_format: Union[AnnotionFormat, AnnotationFormat]) -> AnnotationFormat:
+    # can be removed when `AnnotionFormat` is fully deprecated
+    return AnnotationFormat(annotation_format.value)
+
+
+def validate_annotations(
+    annotation_format: AnnotationFormat,
+    supported_annotation_formats: Tuple[AnnotationFormat, ...],
+    annotations: List[Dict],
+) -> None:
+    if promote_annotation_format(annotation_format) not in supported_annotation_formats:
+        raise ValueError(f"Unsupported annotation format: {format} must be one of {supported_annotation_formats}")
+
+    if promote_annotation_format(annotation_format) is AnnotationFormat.COCO_DETECTION:
+        if not valid_coco_detection_annotations(annotations):
+            raise ValueError(
+                "Invalid COCO detection annotations. Annotations must a dict (single image) or list of dicts "
+                "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
+                "being a list of annotations in the COCO format."
+            )
+
+    if promote_annotation_format(annotation_format) is AnnotationFormat.COCO_PANOPTIC:
+        if not valid_coco_panoptic_annotations(annotations):
+            raise ValueError(
+                "Invalid COCO panoptic annotations. Annotations must a dict (single image) or list of dicts "
+                "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
+                "the latter being a list of annotations in the COCO format."
+            )
diff --git a/src/transformers/integrations/__init__.py b/src/transformers/integrations/__init__.py
index 427b5e00000feb..3d1e41263eef70 100644
--- a/src/transformers/integrations/__init__.py
+++ b/src/transformers/integrations/__init__.py
@@ -17,7 +17,7 @@
 
 
 _import_structure = {
-    "awq": ["replace_with_awq_linear"],
+    "awq": ["fuse_awq_modules", "replace_with_awq_linear"],
     "bitsandbytes": [
         "get_keys_to_not_convert",
         "replace_8bit_linear",
@@ -80,7 +80,7 @@
 }
 
 if TYPE_CHECKING:
-    from .awq import replace_with_awq_linear
+    from .awq import fuse_awq_modules, replace_with_awq_linear
     from .bitsandbytes import (
         get_keys_to_not_convert,
         replace_8bit_linear,
diff --git a/src/transformers/integrations/awq.py b/src/transformers/integrations/awq.py
index 94d996b0fffd4c..336a216e401461 100644
--- a/src/transformers/integrations/awq.py
+++ b/src/transformers/integrations/awq.py
@@ -12,14 +12,33 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 "AWQ (Activation aware Weight Quantization) integration file"
+from ..activations import ACT2FN
+from ..modeling_utils import PreTrainedModel
 from ..utils import is_auto_awq_available, is_torch_available
-from ..utils.quantization_config import AwqBackendPackingMethod, AWQLinearVersion
+from ..utils.quantization_config import AwqBackendPackingMethod, AwqConfig, AWQLinearVersion
 
 
 if is_torch_available():
+    import torch
     import torch.nn as nn
 
 
+AWQ_FUSED_MAPPINGS = {
+    "mistral": {
+        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
+        "mlp": ["gate_proj", "up_proj", "down_proj"],
+        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
+        "use_alibi": False,
+    },
+    "llama": {
+        "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
+        "mlp": ["gate_proj", "up_proj", "down_proj"],
+        "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"],
+        "use_alibi": False,
+    },
+}
+
+
 def replace_with_awq_linear(
     model,
     modules_to_not_convert=None,
@@ -102,3 +121,219 @@ def replace_with_awq_linear(
         # Remove the last key for recursion
         current_key_name.pop(-1)
     return model, has_been_replaced
+
+
+def get_modules_to_fuse(model, quantization_config):
+    """
+    Returns the fusing mapping given the quantization config and the model
+
+    Args:
+        model (`~PreTrainedModel`):
+            The model to fuse - note this model should have been converted into AWQ format beforehand.
+        quantization_config (`~transformers.quantization_config.AWQConfig`):
+            The quantization configuration to use.
+    """
+    if not isinstance(model, PreTrainedModel):
+        raise ValueError(f"The model should be an instance of `PreTrainedModel`, got {model.__class__.__name__}")
+
+    # Always default to `quantization_config.modules_to_fuse`
+    if quantization_config.modules_to_fuse is not None:
+        current_fused_mapping = quantization_config.modules_to_fuse
+        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
+    elif model.config.model_type in AWQ_FUSED_MAPPINGS:
+        current_fused_mapping = AWQ_FUSED_MAPPINGS[model.config.model_type]
+
+        # Handle hidden_size, num_attention_heads, num_key_value_heads on our own.
+        hidden_size = model.config.hidden_size
+        num_attention_heads = model.config.num_attention_heads
+        num_key_value_heads = getattr(model.config, "num_key_value_heads", num_attention_heads)
+
+        # Fill `current_fused_mapping` with the expected values
+        current_fused_mapping["hidden_size"] = hidden_size
+        current_fused_mapping["num_attention_heads"] = num_attention_heads
+        current_fused_mapping["num_key_value_heads"] = num_key_value_heads
+        current_fused_mapping["max_seq_len"] = quantization_config.fuse_max_seq_len
+    else:
+        raise ValueError(
+            "Fusing mapping not found either on the quantization config or the supported `AWQ_FUSED_MAPPINGS`. Please pass a `fused_mapping` argument"
+            " in the `quantization_config` or raise an issue on transformers https://github.com/huggingface/transformers to add its support."
+        )
+    return current_fused_mapping
+
+
+def fuse_awq_modules(model, quantization_config):
+    """
+    Optionally fuse some modules in the model to speedup inference.
+
+    Args:
+        model (`~PreTrainedModel`):
+            The model to fuse - note this model should have been converted into AWQ format beforehand.
+        quantization_config (`dict`):
+            The quantization configuration to use.
+    """
+    # We need to convert it from dict in order to get an AwqConfig object
+    # otherwise the fields `backend` etc. will not be available
+    # https://github.com/huggingface/transformers/pull/27411#discussion_r1414044495
+    awq_config = AwqConfig.from_dict(quantization_config)
+    backend = awq_config.backend
+
+    modules_to_fuse = get_modules_to_fuse(model, awq_config)
+
+    if backend == AwqBackendPackingMethod.AUTOAWQ:
+        from awq.modules.fused.attn import QuantAttentionFused
+        from awq.modules.fused.mlp import QuantFusedMLP
+        from awq.modules.fused.norm import FasterTransformerRMSNorm
+    else:
+        raise ValueError("Fusing is only supported for the AutoAWQ backend")
+
+    for name, module in model.named_modules():
+        # Replace layer norms
+        _fuse_awq_layernorm(modules_to_fuse["layernorm"], module, FasterTransformerRMSNorm)
+
+        # Replace MLP layers
+        _fuse_awq_mlp(model, name, modules_to_fuse["mlp"], module, QuantFusedMLP)
+
+        # Replace attention layers
+        _fuse_awq_attention_layers(model, module, modules_to_fuse, name, QuantAttentionFused)
+    return model
+
+
+def _fuse_awq_layernorm(fuse_module_names, module, target_cls):
+    """
+    Fuse the LayerNorm layers into a target class using autoawq
+
+    Args:
+        fuse_module_names (`List[str]`):
+            The list of module names to fuse
+        module (`nn.Module`):
+            The pytorch parent module that has layernorm modules to fuse
+        target_cls (`~autoawq.FasterTransformerRMSNorm`):
+            The `FasterTransformerRMSNorm` class as it only supports that class
+            for now.
+    """
+    for module_name in fuse_module_names:
+        if hasattr(module, module_name):
+            old_module = getattr(module, module_name)
+            module._modules[module_name] = target_cls(
+                old_module.weight,
+                old_module.variance_epsilon,
+            ).to(old_module.weight.device)
+            del old_module
+
+
+def _fuse_awq_mlp(model, current_module_name, fuse_module_names, module, target_cls):
+    """
+    Fuse the MLP layers into a target class using autoawq
+
+    Args:
+        model (`~PreTrainedModel`):
+            The input pretrained model
+        current_module_name (`str`):
+            The current submodule name
+        fuse_module_names (`List[str]`):
+            The list of module names to fuse. For the MLP layers it has to be an array
+            of length 3 that consists of the 3 MLP layers in the order (gate (dense layer post-attention) / up / down layers)
+        module (`nn.Module`):
+            The pytorch parent module that has layernorm modules to fuse
+        target_cls (`~autoawq.QuantFusedMLP`):
+            The `QuantFusedMLP` class as it only supports that class
+            for now.
+    """
+    if len(fuse_module_names) == 0:
+        return
+
+    if hasattr(module, fuse_module_names[0]):
+        gate_proj = getattr(module, fuse_module_names[0])
+        up_proj = getattr(module, fuse_module_names[1])
+        down_proj = getattr(module, fuse_module_names[2])
+
+        previous_device = gate_proj.qweight.device
+        activation_fn = ACT2FN[model.config.hidden_act]
+        new_module = target_cls(gate_proj, down_proj, up_proj, activation_fn)
+
+        parent_name, child_name = current_module_name.rsplit(".", 1)
+        parent = model.get_submodule(parent_name)
+        setattr(parent, child_name, new_module.to(previous_device))
+
+        del gate_proj, up_proj, down_proj
+
+
+def _fuse_awq_attention_layers(model, module, modules_to_fuse, current_module_name, target_cls):
+    """
+    Fuse the Attention layers into a target class using autoawq
+
+    Args:
+        model (`~PreTrainedModel`):
+            The input pretrained model
+        module (`nn.Module`):
+            The pytorch parent module that has layernorm modules to fuse
+        modules_to_fuse (`List[str]`):
+            The module fusing mapping. The dictionary has to contain a field `attention` with attention module names
+            in the correct order: q, k, v, o layer
+        current_module_name (`str`):
+            The current submodule name
+        target_cls (`~autoawq.QuantAttentionFused`):
+            The `QuantAttentionFused` class as it only supports that class
+            for now.
+    """
+    from awq.modules.linear import WQLinear_GEMM, WQLinear_GEMV
+
+    if len(modules_to_fuse["attention"]) == 0:
+        return
+
+    if hasattr(module, modules_to_fuse["attention"][0]):
+        # First, we pack the QKV layers together
+        q_proj = getattr(module, modules_to_fuse["attention"][0])
+        previous_device = q_proj.qweight.device
+
+        if isinstance(q_proj, WQLinear_GEMV):
+            linear_target_cls = WQLinear_GEMV
+            cat_dim = 0
+        elif isinstance(q_proj, WQLinear_GEMM):
+            linear_target_cls = WQLinear_GEMM
+            cat_dim = 1
+        else:
+            raise ValueError("Unsupported q_proj type: {type(q_proj)}")
+
+        k_proj = getattr(module, modules_to_fuse["attention"][1])
+        v_proj = getattr(module, modules_to_fuse["attention"][2])
+        o_proj = getattr(module, modules_to_fuse["attention"][3])
+
+        bias = torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0) if q_proj.bias is not None else None
+
+        qkv_layer = linear_target_cls(
+            q_proj.w_bit,
+            q_proj.group_size,
+            q_proj.in_features,
+            q_proj.out_features + k_proj.out_features + v_proj.out_features,
+            q_proj.bias is not None,
+            next(iter(module.state_dict().values())).device,
+        )
+
+        qkv_layer.qweight = torch.cat([q_proj.qweight, k_proj.qweight, v_proj.qweight], dim=cat_dim)
+        qkv_layer.qzeros = torch.cat([q_proj.qzeros, k_proj.qzeros, v_proj.qzeros], dim=cat_dim)
+        qkv_layer.scales = torch.cat([q_proj.scales, k_proj.scales, v_proj.scales], dim=cat_dim)
+
+        if isinstance(qkv_layer, WQLinear_GEMV):
+            qkv_layer.split_k_iters = q_proj.split_k_iters
+
+        qkv_layer.bias = bias
+
+        fused_attention_layer = target_cls(
+            modules_to_fuse["hidden_size"],
+            modules_to_fuse["num_attention_heads"],
+            modules_to_fuse["num_key_value_heads"],
+            qkv_layer,
+            o_proj,
+            previous_device,
+            modules_to_fuse["max_seq_len"],
+            use_alibi=modules_to_fuse["use_alibi"],
+        )
+
+        fused_attention_layer.is_hf_transformers = True
+
+        parent_name, child_name = current_module_name.rsplit(".", 1)
+        parent = model.get_submodule(parent_name)
+        setattr(parent, child_name, fused_attention_layer.to(previous_device))
+
+        del q_proj, k_proj, v_proj, o_proj
diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py
index 5bddb24ed326c6..145a3b25289f1a 100644
--- a/src/transformers/integrations/integration_utils.py
+++ b/src/transformers/integrations/integration_utils.py
@@ -236,8 +236,9 @@ def _objective(trial, checkpoint_dir=None):
 
 def run_hp_search_ray(trainer, n_trials: int, direction: str, **kwargs) -> BestRun:
     import ray
+    import ray.train
 
-    def _objective(trial, local_trainer, checkpoint_dir=None):
+    def _objective(trial: dict, local_trainer):
         try:
             from transformers.utils.notebook import NotebookProgressCallback
 
@@ -246,19 +247,34 @@ def _objective(trial, local_trainer, checkpoint_dir=None):
         except ModuleNotFoundError:
             pass
 
-        checkpoint = None
-        if checkpoint_dir:
-            for subdir in os.listdir(checkpoint_dir):
-                if subdir.startswith(PREFIX_CHECKPOINT_DIR):
-                    checkpoint = os.path.join(checkpoint_dir, subdir)
         local_trainer.objective = None
-        local_trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
+
+        checkpoint = ray.train.get_checkpoint()
+        if checkpoint:
+            # Upon trial resume, the local_trainer's objective gets reset to None.
+            # If `local_trainer.train` is a noop (training has already reached
+            # the target number of epochs/steps), then this would
+            # trigger an unnecessary extra checkpoint at the end of training.
+            # -> Set the objective to a dummy value upon resume as a workaround.
+            local_trainer.objective = "objective"
+
+            with checkpoint.as_directory() as checkpoint_dir:
+                checkpoint_path = next(Path(checkpoint_dir).glob(f"{PREFIX_CHECKPOINT_DIR}*")).as_posix()
+                local_trainer.train(resume_from_checkpoint=checkpoint_path, trial=trial)
+        else:
+            local_trainer.train(trial=trial)
+
         # If there hasn't been any evaluation during the training loop.
         if getattr(local_trainer, "objective", None) is None:
             metrics = local_trainer.evaluate()
             local_trainer.objective = local_trainer.compute_objective(metrics)
-            local_trainer._tune_save_checkpoint()
-            ray.tune.report(objective=local_trainer.objective, **metrics, done=True)
+
+            metrics.update({"objective": local_trainer.objective, "done": True})
+
+            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
+                local_trainer._tune_save_checkpoint(checkpoint_dir=temp_checkpoint_dir)
+                checkpoint = ray.train.Checkpoint.from_directory(temp_checkpoint_dir)
+                ray.train.report(metrics, checkpoint=checkpoint)
 
     if not trainer._memory_tracker.skip_memory_metrics:
         from ..trainer_utils import TrainerMemoryTracker
@@ -296,28 +312,10 @@ def _objective(trial, local_trainer, checkpoint_dir=None):
         from ray.tune import CLIReporter
 
         kwargs["progress_reporter"] = CLIReporter(metric_columns=["objective"])
-    if "keep_checkpoints_num" in kwargs and kwargs["keep_checkpoints_num"] > 0:
-        # `keep_checkpoints_num=0` would disabled checkpointing
-        trainer.use_tune_checkpoints = True
-        if kwargs["keep_checkpoints_num"] > 1:
-            logger.warning(
-                f"Currently keeping {kwargs['keep_checkpoints_num']} checkpoints for each trial. "
-                "Checkpoints are usually huge, "
-                "consider setting `keep_checkpoints_num=1`."
-            )
+
     if "scheduler" in kwargs:
         from ray.tune.schedulers import ASHAScheduler, HyperBandForBOHB, MedianStoppingRule, PopulationBasedTraining
 
-        # Check if checkpointing is enabled for PopulationBasedTraining
-        if isinstance(kwargs["scheduler"], PopulationBasedTraining):
-            if not trainer.use_tune_checkpoints:
-                logger.warning(
-                    "You are using PopulationBasedTraining but you haven't enabled checkpointing. "
-                    "This means your trials will train from scratch everytime they are exploiting "
-                    "new configurations. Consider enabling checkpointing by passing "
-                    "`keep_checkpoints_num=1` as an additional argument to `Trainer.hyperparameter_search`."
-                )
-
         # Check for `do_eval` and `eval_during_training` for schedulers that require intermediate reporting.
         if isinstance(
             kwargs["scheduler"], (ASHAScheduler, MedianStoppingRule, HyperBandForBOHB, PopulationBasedTraining)
@@ -1660,7 +1658,7 @@ def setup(self, args, state, model):
         """
         from dvclive import Live
 
-        self._initalized = True
+        self._initialized = True
         if self._log_model is not None:
             log_model_env = os.getenv("HF_DVCLIVE_LOG_MODEL")
             if log_model_env.upper() in ENV_VARS_TRUE_VALUES:
@@ -1680,10 +1678,19 @@ def on_log(self, args, state, control, model=None, logs=None, **kwargs):
         if not self._initialized:
             self.setup(args, state, model)
         if state.is_world_process_zero:
+            from dvclive.plots import Metric
             from dvclive.utils import standardize_metric_name
 
             for key, value in logs.items():
-                self.live.log_metric(standardize_metric_name(key, "dvclive.huggingface"), value)
+                if Metric.could_log(value):
+                    self.live.log_metric(standardize_metric_name(key, "dvclive.huggingface"), value)
+                else:
+                    logger.warning(
+                        "Trainer is attempting to log a value of "
+                        f'"{value}" of type {type(value)} for key "{key}" as a scalar. '
+                        "This invocation of DVCLive's Live.log_metric() "
+                        "is incorrect so we dropped this attribute."
+                    )
             self.live.next_step()
 
     def on_save(self, args, state, control, **kwargs):
diff --git a/src/transformers/modeling_attn_mask_utils.py b/src/transformers/modeling_attn_mask_utils.py
index 9658adc55d5c96..734f443e1fc9d4 100755
--- a/src/transformers/modeling_attn_mask_utils.py
+++ b/src/transformers/modeling_attn_mask_utils.py
@@ -33,7 +33,7 @@ class AttentionMaskConverter:
     >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter
 
     >>> converter = AttentionMaskConverter(True)
-    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, 5)
+    >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32)
     tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
             [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
             [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38],
@@ -66,9 +66,9 @@ def to_causal_4d(
         batch_size: int,
         query_length: int,
         key_value_length: int,
-        dtype: torch.dtype = torch.float32,
+        dtype: torch.dtype,
         device: Union[torch.device, "str"] = "cpu",
-    ) -> torch.Tensor:
+    ) -> Optional[torch.Tensor]:
         """
         Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative
         bias to upper right hand triangular matrix (causal mask).
@@ -98,8 +98,8 @@ def to_4d(
         self,
         attention_mask_2d: torch.Tensor,
         query_length: int,
+        dtype: torch.dtype,
         key_value_length: Optional[int] = None,
-        dtype: torch.dtype = torch.float32,
     ) -> torch.Tensor:
         """
         Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length,
@@ -184,6 +184,95 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int]
 
         return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min)
 
+    @staticmethod
+    def _unmask_unattended(
+        expanded_mask: torch.Tensor, attention_mask: torch.Tensor, unmasked_value: Union[bool, float]
+    ):
+        # fmt: off
+        """
+        Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when
+        using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+        Details: https://github.com/pytorch/pytorch/issues/110213
+
+        `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len].
+        `attention_mask` is [bsz, src_seq_len].
+
+        The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias.
+
+        For example, if `attention_mask` is
+        ```
+        [[0, 0, 1],
+         [1, 1, 1],
+         [0, 1, 1]]
+        ```
+        and `expanded_mask` is (e.g. here left-padding case)
+        ```
+        [[[[0, 0, 0],
+           [0, 0, 0],
+           [0, 0, 1]]],
+         [[[1, 0, 0],
+           [1, 1, 0],
+           [1, 1, 1]]],
+         [[[0, 0, 0],
+           [0, 1, 0],
+           [0, 1, 1]]]]
+        ```
+        then the modified `expanded_mask` will be
+        ```
+        [[[[1, 1, 1],   <-- modified
+           [1, 1, 1],   <-- modified
+           [0, 0, 1]]],
+         [[[1, 0, 0],
+           [1, 1, 0],
+           [1, 1, 1]]],
+         [[[1, 1, 1],   <-- modified
+           [0, 1, 0],
+           [0, 1, 1]]]]
+        ```
+        """
+        # fmt: on
+
+        # Get the index of the first non-zero value for every sample in the batch.
+        # In the above example, indices = [[2], [0], [1]]]
+        tmp = torch.arange(attention_mask.shape[1], 0, -1)
+        indices = torch.argmax(attention_mask.cpu() * tmp, 1, keepdim=True)
+
+        # Find the batch indexes that have unattended tokens on the leftmost side (e.g. [0, 0, 1, 1, 1]), for which the first rows of the
+        # expanded mask will be completely unattended.
+        left_masked_rows = torch.where(indices > 0)[0]
+
+        if left_masked_rows.shape[0] == 0:
+            return expanded_mask
+        indices = indices[left_masked_rows]
+
+        max_len = torch.max(indices)
+        range_tensor = torch.arange(max_len).unsqueeze(0)
+        range_tensor = range_tensor.repeat(indices.size(0), 1)
+
+        # Avoid unmasking tokens at relevant target positions (on the row axis), by rather unmasking possibly several times the first row that should always be unmasked as we filtered out the batch above.
+        range_tensor[range_tensor >= indices] = 0
+
+        # TODO: we may drop support for 3D attention mask as the refactor from Patrick maybe dropped this case
+        if expanded_mask.dim() == 4:
+            num_masks = expanded_mask.shape[1]
+            if num_masks == 1:
+                # Broadcast [left_masked_rows, 1], [left_masked_rows, max_len]
+                mask_slice = (left_masked_rows[:, None], 0, range_tensor)
+            else:
+                # Broadcast [left_masked_rows, 1, 1], [1, num_masks, 1], [left_masked_rows, 1, max_len]
+                mask_slice = (
+                    left_masked_rows[:, None, None],
+                    torch.arange(num_masks)[None, :, None],
+                    range_tensor[:, None, :],
+                )
+        else:
+            # Broadcast [left_masked_rows, 1], [left_masked_rows, max_len]
+            mask_slice = (left_masked_rows[:, None], range_tensor)
+
+        expanded_mask[mask_slice] = unmasked_value
+
+        return expanded_mask
+
 
 def _prepare_4d_causal_attention_mask(
     attention_mask: Optional[torch.Tensor],
@@ -215,7 +304,7 @@ def _prepare_4d_causal_attention_mask(
     # 4d mask is passed through the layers
     if attention_mask is not None:
         attention_mask = attn_mask_converter.to_4d(
-            attention_mask, input_shape[-1], key_value_length, dtype=inputs_embeds.dtype
+            attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype
         )
     else:
         attention_mask = attn_mask_converter.to_causal_4d(
@@ -225,6 +314,78 @@ def _prepare_4d_causal_attention_mask(
     return attention_mask
 
 
+# Adapted from _prepare_4d_causal_attention_mask
+def _prepare_4d_causal_attention_mask_for_sdpa(
+    attention_mask: Optional[torch.Tensor],
+    input_shape: Union[torch.Size, Tuple, List],
+    inputs_embeds: torch.Tensor,
+    past_key_values_length: int,
+    sliding_window: Optional[int] = None,
+):
+    """
+    Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`.
+
+    In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and
+    `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks,
+    allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed).
+    """
+    attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window)
+
+    key_value_length = input_shape[-1] + past_key_values_length
+    batch_size, query_length = input_shape
+
+    # torch.jit.trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1`
+    # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing.
+    # TODO: Fix this as well when using torchdynamo with fullgraph=True.
+    is_tracing = torch.jit.is_tracing()
+
+    if attention_mask is not None:
+        if torch.all(attention_mask == 1):
+            if is_tracing:
+                pass
+            elif query_length == 1:
+                # For query_length == 1, causal attention and bi-directional attention are the same.
+                attention_mask = None
+            elif key_value_length == query_length:
+                attention_mask = None
+            else:
+                # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore the attention mask, as SDPA causal mask generation
+                # may be wrong. We will set `is_causal=False` in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here.
+                # Reference: https://github.com/pytorch/pytorch/issues/108108
+                pass
+    elif query_length > 1 and key_value_length != query_length:
+        # See the comment above (https://github.com/pytorch/pytorch/issues/108108).
+        # Ugly: we set it to True here to dispatch in the following controlflow to `to_causal_4d`.
+        attention_mask = True
+    elif is_tracing:
+        raise ValueError(
+            'Attention using SDPA can not be traced with torch.jit.trace when no attention_mask is provided. To solve this issue, please either load your model with the argument `attn_implementation="eager"` or pass an attention_mask input when tracing the model.'
+        )
+
+    if attention_mask is None:
+        expanded_4d_mask = None
+    elif attention_mask is True:
+        expanded_4d_mask = attn_mask_converter.to_causal_4d(
+            input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+    else:
+        expanded_4d_mask = attn_mask_converter.to_4d(
+            attention_mask,
+            input_shape[-1],
+            dtype=inputs_embeds.dtype,
+            key_value_length=key_value_length,
+        )
+
+        # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
+        # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
+        if query_length > 1:
+            expanded_4d_mask = AttentionMaskConverter._unmask_unattended(
+                expanded_4d_mask, attention_mask, unmasked_value=0.0
+            )
+
+    return expanded_4d_mask
+
+
 def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     """
     Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
@@ -241,13 +402,51 @@ def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len:
     return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
 
 
+def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
+    """
+    Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+    `(batch_size, key_value_length)`
+
+    Args:
+        mask (`torch.Tensor` or `None`):
+            A 2D attention mask of shape `(batch_size, key_value_length)`
+        dtype (`torch.dtype`):
+            The torch dtype the created mask shall have.
+        tgt_len (`int`):
+            The target length or query length the created mask shall have.
+    """
+    batch_size, key_value_length = mask.shape
+    tgt_len = tgt_len if tgt_len is not None else key_value_length
+
+    # torch.jit.trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1`
+    # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing.
+    # TODO: Fix this as well when using torchdynamo with fullgraph=True.
+    is_tracing = torch.jit.is_tracing()
+
+    if torch.all(mask == 1):
+        if is_tracing:
+            pass
+        elif tgt_len == 1:
+            # For query_length == 1, causal attention and bi-directional attention are the same.
+            return None
+        elif key_value_length == tgt_len:
+            return None
+        else:
+            # Unfortunately, for query_length > 1 and key_value_length != query_length, we can not generally ignore the attention mask, as SDPA causal mask generation
+            # may be wrong. We will set is_causal=False in SDPA and rely on Transformers attention_mask instead, hence not setting it to None here.
+            # Reference: https://github.com/pytorch/pytorch/issues/108108
+            return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
+    else:
+        return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
+
+
 def _create_4d_causal_attention_mask(
     input_shape: Union[torch.Size, Tuple, List],
     dtype: torch.dtype,
     device: torch.device,
     past_key_values_length: int = 0,
     sliding_window: Optional[int] = None,
-):
+) -> Optional[torch.Tensor]:
     """
     Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)`
 
diff --git a/src/transformers/modeling_flax_utils.py b/src/transformers/modeling_flax_utils.py
index d4617c111bb2a1..eb14216c5cd994 100644
--- a/src/transformers/modeling_flax_utils.py
+++ b/src/transformers/modeling_flax_utils.py
@@ -915,7 +915,7 @@ def from_pretrained(
                 state = jax.tree_util.tree_map(jnp.array, state)
             else:
                 # keep the params on CPU if we don't want to initialize
-                state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.devices("cpu")[0]), state)
+                state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.local_devices(backend="cpu")[0]), state)
 
         if "batch_stats" in state:  # if flax model contains batch norm layers
             # if model is base model only use model_prefix key
@@ -1267,13 +1267,17 @@ def overwrite_call_docstring(model_class, docstring):
     model_class.__call__ = add_start_docstrings_to_model_forward(docstring)(model_class.__call__)
 
 
-def append_call_sample_docstring(model_class, checkpoint, output_type, config_class, mask=None):
+def append_call_sample_docstring(
+    model_class, checkpoint, output_type, config_class, mask=None, revision=None, real_checkpoint=None
+):
     model_class.__call__ = copy_func(model_class.__call__)
     model_class.__call__ = add_code_sample_docstrings(
         checkpoint=checkpoint,
         output_type=output_type,
         config_class=config_class,
         model_cls=model_class.__name__,
+        revision=revision,
+        real_checkpoint=real_checkpoint,
     )(model_class.__call__)
 
 
diff --git a/src/transformers/modeling_outputs.py b/src/transformers/modeling_outputs.py
index aceec7abd40643..cbee6a292b531b 100755
--- a/src/transformers/modeling_outputs.py
+++ b/src/transformers/modeling_outputs.py
@@ -368,6 +368,97 @@ class MoEModelOutput(ModelOutput):
     router_probs: Optional[Tuple[torch.FloatTensor]] = None
 
 
+@dataclass
+class MoeModelOutputWithPast(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states and attentions.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
+            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
+            encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+            input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+
+            Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
+            loss for Mixture of Experts models.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    router_logits: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class MoeCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for causal language model (or autoregressive) with mixture of experts outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+
+        aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
+            aux_loss for the sparse modules.
+
+        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+
+            Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary
+            loss for Mixture of Experts models.
+
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    aux_loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    router_logits: Optional[Tuple[torch.FloatTensor]] = None
+
+
 @dataclass
 class MoEModelOutputWithPastAndCrossAttentions(ModelOutput):
     """
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index d45b95fa5bc70f..c599b795bf1932 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -318,7 +318,14 @@ def load_pytorch_state_dict_in_tf2_model(
             name_scope=_prefix,
         )
         if tf_to_pt_weight_rename is not None:
-            name = tf_to_pt_weight_rename(name)
+            aliases = tf_to_pt_weight_rename(name)  # Is a tuple to account for possible name aliasing
+            for alias in aliases:  # The aliases are in priority order, take the first one that matches
+                if alias in tf_keys_to_pt_keys:
+                    name = alias
+                    break
+            else:
+                # If none of the aliases match, just use the first one (it'll be reported as missing)
+                name = aliases[0]
 
         # Find associated numpy array in pytorch model state dict
         if name not in tf_keys_to_pt_keys:
diff --git a/src/transformers/modeling_tf_utils.py b/src/transformers/modeling_tf_utils.py
index bfd928a9011fe2..00fe790252bf62 100644
--- a/src/transformers/modeling_tf_utils.py
+++ b/src/transformers/modeling_tf_utils.py
@@ -2892,6 +2892,12 @@ def from_pretrained(
         # Instantiate model.
         model = cls(config, *model_args, **model_kwargs)
 
+        if tf_to_pt_weight_rename is None and hasattr(model, "tf_to_pt_weight_rename"):
+            # TODO Matt: This is a temporary workaround to allow weight renaming, but requires a method
+            #            to be defined for each class that requires a rename. We can probably just have a class-level
+            #            dict and a single top-level method or something and cut down a lot of boilerplate code
+            tf_to_pt_weight_rename = model.tf_to_pt_weight_rename
+
         if from_pt:
             from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 
diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py
index d7e0580e43592d..3247c323685815 100644
--- a/src/transformers/modeling_utils.py
+++ b/src/transformers/modeling_utils.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import collections
+import copy
 import functools
 import gc
 import importlib.metadata
@@ -49,6 +50,7 @@
     prune_layer,
     prune_linear_layer,
 )
+from .safetensors_conversion import auto_conversion
 from .utils import (
     ADAPTER_SAFE_WEIGHTS_NAME,
     ADAPTER_WEIGHTS_NAME,
@@ -79,6 +81,7 @@
     is_peft_available,
     is_remote_url,
     is_safetensors_available,
+    is_torch_sdpa_available,
     is_torch_tpu_available,
     logging,
     replace_return_docstrings,
@@ -132,8 +135,12 @@ def is_fsdp_enabled():
     )
 
 
-def is_fsdp_enabled_and_dist_rank_0():
-    return is_fsdp_enabled() and int(os.environ.get("LOCAL_RANK", -1)) == 0
+def is_local_dist_rank_0():
+    return (
+        torch.distributed.is_available()
+        and torch.distributed.is_initialized()
+        and int(os.environ.get("LOCAL_RANK", -1)) == 0
+    )
 
 
 if is_sagemaker_mp_enabled():
@@ -147,6 +154,23 @@ def is_fsdp_enabled_and_dist_rank_0():
 if is_peft_available():
     from .utils import find_adapter_config_file
 
+TORCH_INIT_FUNCTIONS = {
+    "uniform_": nn.init.uniform_,
+    "normal_": nn.init.normal_,
+    "trunc_normal_": nn.init.trunc_normal_,
+    "constant_": nn.init.constant_,
+    "xavier_uniform_": nn.init.xavier_uniform_,
+    "xavier_normal_": nn.init.xavier_normal_,
+    "kaiming_uniform_": nn.init.kaiming_uniform_,
+    "kaiming_normal_": nn.init.kaiming_normal_,
+    "uniform": nn.init.uniform,
+    "normal": nn.init.normal,
+    "xavier_uniform": nn.init.xavier_uniform,
+    "xavier_normal": nn.init.xavier_normal,
+    "kaiming_uniform": nn.init.kaiming_uniform,
+    "kaiming_normal": nn.init.kaiming_normal,
+}
+
 
 @contextmanager
 def no_init_weights(_enable=True):
@@ -157,12 +181,24 @@ def no_init_weights(_enable=True):
     """
     global _init_weights
     old_init_weights = _init_weights
+
     if _enable:
         _init_weights = False
+
+        def _skip_init(*args, **kwargs):
+            pass
+
+        # # Save the original initialization functions
+        for name, init_func in TORCH_INIT_FUNCTIONS.items():
+            setattr(torch.nn.init, name, _skip_init)
     try:
         yield
     finally:
         _init_weights = old_init_weights
+        if _enable:
+            # # Restore the original initialization functions
+            for name, init_func in TORCH_INIT_FUNCTIONS.items():
+                setattr(torch.nn.init, name, init_func)
 
 
 def get_parameter_device(parameter: Union[nn.Module, GenerationMixin, "ModuleUtilsMixin"]):
@@ -474,13 +510,12 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike]):
         return safe_load_file(checkpoint_file)
     try:
         if (
-            (is_deepspeed_zero3_enabled() or is_fsdp_enabled())
-            and torch.distributed.is_initialized()
-            and torch.distributed.get_rank() > 0
-        ):
+            is_deepspeed_zero3_enabled() and torch.distributed.is_initialized() and torch.distributed.get_rank() > 0
+        ) or (is_fsdp_enabled() and not is_local_dist_rank_0()):
             map_location = "meta"
         else:
             map_location = "cpu"
+
         return torch.load(checkpoint_file, map_location=map_location)
     except Exception as e:
         try:
@@ -1006,7 +1041,7 @@ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool
             else:
                 raise ValueError(
                     "bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong"
-                    " make sure to install bitsandbytes with `pip install bitsandbytes`."
+                    " make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. "
                 )
 
         for param in total_parameters:
@@ -1123,6 +1158,12 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
     # Flash Attention 2 support
     _supports_flash_attn_2 = False
 
+    # SDPA support
+    _supports_sdpa = False
+
+    # Has support for a `Cache` instance as `past_key_values`
+    _supports_cache_class = False
+
     @property
     def dummy_inputs(self) -> Dict[str, torch.Tensor]:
         """
@@ -1146,10 +1187,18 @@ def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
                 f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`"
             )
         # Save config and origin of the pretrained weights if given in model
+        config = self._autoset_attn_implementation(
+            config, torch_dtype=torch.get_default_dtype(), check_device_map=False
+        )
         self.config = config
+
         self.name_or_path = config.name_or_path
         self.warnings_issued = {}
         self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None
+        # Overwrite the class attribute to make it an instance attribute, so models like
+        # `InstructBlipForConditionalGeneration` can dynamically update it without modifying the class attribute
+        # when a different component (e.g. language_model) is used.
+        self._keep_in_fp32_modules = copy.copy(self.__class__._keep_in_fp32_modules)
 
     def post_init(self):
         """
@@ -1175,12 +1224,19 @@ def _from_config(cls, config, **kwargs):
                 Override the default `torch.dtype` and load the model under this dtype.
         """
         torch_dtype = kwargs.pop("torch_dtype", None)
+        use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False)
 
         # override default dtype if needed
         dtype_orig = None
         if torch_dtype is not None:
             dtype_orig = cls._set_default_torch_dtype(torch_dtype)
 
+        config = copy.deepcopy(config)  # We do not want to modify the config inplace in _from_config.
+        config._attn_implementation = kwargs.pop("attn_implementation", None)
+        config = cls._autoset_attn_implementation(
+            config, use_flash_attention_2=use_flash_attention_2, check_device_map=False
+        )
+
         if is_deepspeed_zero3_enabled():
             import deepspeed
 
@@ -1198,6 +1254,68 @@ def _from_config(cls, config, **kwargs):
 
         return model
 
+    @classmethod
+    def _autoset_attn_implementation(
+        cls,
+        config,
+        use_flash_attention_2: bool = False,
+        torch_dtype: Optional[torch.dtype] = None,
+        device_map: Optional[Union[str, Dict[str, int]]] = None,
+        check_device_map: bool = True,
+    ):
+        """
+        Automatically checks and dispatches to a default attention implementation. In order of priority:
+            1. An implementation specified in `config._attn_implementation` (due for example to the argument attn_implementation="sdpa" in from_pretrained).
+            2. DEPRECATED: if use_flash_attention_2 is set to `True` and `flash_attn` is available, flash attention. (`LlamaFlashAttention` for example)
+            3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example)
+            4. The default model's implementation otherwise (`LlamaAttention` for example) .
+        """
+        # Here we use config._attn_implementation_internal to check whether the attention implementation was explicitely set by the user.
+        # The property `PretrainedConfig._attn_implementation` is never `None`, for backward compatibility (always fall back on "eager").
+        # The `hasattr` here is used as some Transformers tests for some reason do not call PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model)
+        requested_attn_implementation = None
+        if hasattr(config, "_attn_implementation_internal") and config._attn_implementation_internal is not None:
+            if config._attn_implementation != "flash_attention_2" and use_flash_attention_2:
+                raise ValueError(
+                    f'Both attn_implementation="{config._attn_implementation}" and `use_flash_attention_2=True` were used when loading the model, which are not compatible.'
+                    ' We recommend to just use `attn_implementation="flash_attention_2"` when loading the model.'
+                )
+
+            if config._attn_implementation not in ["eager", "sdpa", "flash_attention_2"]:
+                message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)'
+                if cls._supports_flash_attn_2:
+                    message += ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)'
+                if cls._supports_sdpa:
+                    message += ', `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)'
+                raise ValueError(message + ".")
+
+            # If a config is passed with a preset attn_implementation, we skip the automatic dispatch and use the user-provided config, with hard checks that the requested attention implementation is available.
+            requested_attn_implementation = config._attn_implementation_internal
+
+        if use_flash_attention_2:
+            logger.warning_once(
+                'The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.'
+            )
+            config._attn_implementation = "flash_attention_2"
+
+        if config._attn_implementation == "flash_attention_2":
+            cls._check_and_enable_flash_attn_2(
+                config,
+                torch_dtype=torch_dtype,
+                device_map=device_map,
+                hard_check_only=False,
+                check_device_map=check_device_map,
+            )
+        elif requested_attn_implementation in [None, "sdpa"]:
+            # use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif.
+            config = cls._check_and_enable_sdpa(
+                config, hard_check_only=False if requested_attn_implementation is None else True
+            )
+        else:
+            config._attn_implementation = "eager"
+
+        return config
+
     @classmethod
     def _set_default_torch_dtype(cls, dtype: torch.dtype) -> torch.dtype:
         """
@@ -1248,40 +1366,46 @@ def can_generate(cls) -> bool:
 
     @classmethod
     def _check_and_enable_flash_attn_2(
-        cls, config, torch_dtype: Optional[torch.dtype] = None, device_map: Optional[Union[str, Dict[str, int]]] = None
+        cls,
+        config,
+        torch_dtype: Optional[torch.dtype] = None,
+        device_map: Optional[Union[str, Dict[str, int]]] = None,
+        check_device_map: bool = True,
+        hard_check_only: bool = False,
     ) -> PretrainedConfig:
         """
-        If you don't know about Flash Attention, check out the official repository of flash attention:
-        https://github.com/Dao-AILab/flash-attention
+        Checks the availability of Flash Attention 2 and compatibility with the current model.
 
-        For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this
-        specific section of the documentation to learn more about it:
-        https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models
-
-        The method checks if the current setup is compatible with Flash Attention as it requires the model to be in
-        half precision and not ran on CPU.
-
-        If all checks pass, the method will create an attribute in the config `_flash_attn_2_enabled` so that the model
-        can initialize the correct attention module
+        If all checks pass and `hard_check_only` is False, the method will set the config attribute `attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
         """
         if not cls._supports_flash_attn_2:
             raise ValueError(
-                "The current architecture does not support Flash Attention 2.0. Please open an issue on GitHub to "
+                f"{cls.__name__} does not support Flash Attention 2.0 yet. Please open an issue on GitHub to "
                 "request support for this architecture: https://github.com/huggingface/transformers/issues/new"
             )
 
         if not is_flash_attn_2_available():
-            raise ImportError(
-                "Flash Attention 2 is not available. Please refer to the documentation of https://github.com/Dao-AILab/flash-attention for"
-                " installing it. Make sure to have at least the version 2.1.0"
-            )
-        else:
+            preface = "FlashAttention2 has been toggled on, but it cannot be used due to the following error:"
+            install_message = "Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2."
+
+            if importlib.util.find_spec("flash_attn") is None:
+                raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}")
+
             flash_attention_version = version.parse(importlib.metadata.version("flash_attn"))
-            is_flash_greater_than_2 = flash_attention_version >= version.parse("2.1.0")
-            if not is_flash_greater_than_2:
-                raise ValueError(
-                    f"You need flash_attn package version to be greater or equal than 2.1. Make sure to have that version installed - detected version {flash_attention_version}"
-                )
+            if torch.version.cuda:
+                if flash_attention_version < version.parse("2.1.0"):
+                    raise ImportError(
+                        f"{preface} you need flash_attn package version to be greater or equal than 2.1.0. Detected version {flash_attention_version}. {install_message}"
+                    )
+                else:
+                    raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
+            elif torch.version.hip:
+                if flash_attention_version < version.parse("2.0.4"):
+                    raise ImportError(
+                        f"{preface} you need flash_attn package version to be greater or equal than 2.0.4. Make sure to have that version installed - detected version {flash_attention_version}. {install_message}"
+                    )
+                else:
+                    raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}")
 
         _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
 
@@ -1300,20 +1424,23 @@ def _check_and_enable_flash_attn_2(
                 " unexpected behaviour."
             )
 
-        if device_map is None:
+        # The check `torch.empty(0).device.type != "cuda"` is needed as the model may be initialized after `torch.set_default_device` has been called,
+        # or the model may be initialized under the context manager `with torch.device("cuda"):`.
+        if check_device_map and device_map is None and torch.empty(0).device.type != "cuda":
             if torch.cuda.is_available():
                 logger.warning(
-                    "You are attempting to use Flash Attention 2.0 with a model initialized on CPU. Make sure to move the model to GPU"
+                    "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU"
                     " after initializing it on CPU with `model.to('cuda')`."
                 )
             else:
                 raise ValueError(
-                    "You are attempting to use Flash Attention 2.0 with a model initialized on CPU and with no GPU available. "
+                    "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU and with no GPU available. "
                     "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map "
                     "or initialising the model on CPU and then moving it to GPU."
                 )
         elif (
-            device_map is not None
+            check_device_map
+            and device_map is not None
             and isinstance(device_map, dict)
             and ("cpu" in device_map.values() or "disk" in device_map.values())
         ):
@@ -1321,7 +1448,37 @@ def _check_and_enable_flash_attn_2(
                 "You are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to "
                 "initialise the model on a GPU by passing a device_map that contains only GPU devices as keys."
             )
-        config._flash_attn_2_enabled = True
+        if not hard_check_only:
+            config._attn_implementation = "flash_attention_2"
+        return config
+
+    @classmethod
+    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
+        """
+        Checks the availability of SDPA for a given model.
+
+        If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module.
+        """
+        if hard_check_only:
+            if not cls._supports_sdpa:
+                raise ValueError(
+                    f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet. Please open an issue on GitHub to "
+                    "request support for this architecture: https://github.com/huggingface/transformers/issues/new"
+                )
+            if not is_torch_sdpa_available():
+                raise ImportError(
+                    "PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1."
+                )
+
+        if not is_torch_sdpa_available() or not cls._supports_sdpa:
+            return config
+
+        _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
+        if _is_bettertransformer:
+            return config
+
+        if not hard_check_only:
+            config._attn_implementation = "sdpa"
         return config
 
     def enable_input_require_grads(self):
@@ -1378,7 +1535,10 @@ def get_output_embeddings(self) -> nn.Module:
 
     def _init_weights(self, module):
         """
-        Initialize the weights. This method should be overridden by derived class.
+        Initialize the weights. This method should be overridden by derived class and is
+        the only initialization method that will be called when loading a checkpoint
+        using `from_pretrained`. Any attempt to initialize outside of this function
+        will be useless as the torch.nn.init function are all replaced with skip.
         """
         pass
 
@@ -1580,6 +1740,8 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
         if hasattr(old_embeddings, "_hf_hook"):
             hook = old_embeddings._hf_hook
             add_hook_to_module(new_embeddings, hook)
+        old_embeddings_requires_grad = old_embeddings.weight.requires_grad
+        new_embeddings.requires_grad_(old_embeddings_requires_grad)
         self.set_input_embeddings(new_embeddings)
 
         # Update new_num_tokens with the actual size of new_embeddings
@@ -1599,6 +1761,8 @@ def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None):
             if hasattr(old_lm_head, "_hf_hook"):
                 hook = old_lm_head._hf_hook
                 add_hook_to_module(new_lm_head, hook)
+            old_lm_head_requires_grad = old_lm_head.weight.requires_grad
+            new_lm_head.requires_grad_(old_lm_head_requires_grad)
             self.set_output_embeddings(new_lm_head)
 
         return self.get_input_embeddings()
@@ -1870,7 +2034,18 @@ def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None):
 
         gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs)
 
-        self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
+        # For old GC format (transformers < 4.35.0) for models that live on the Hub
+        # we will fall back to the overwritten `_set_gradient_checkpointing` methid
+        _is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters
+
+        if not _is_using_old_format:
+            self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func)
+        else:
+            self.apply(partial(self._set_gradient_checkpointing, value=True))
+            logger.warn(
+                "You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)."
+                "Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model."
+            )
 
         if getattr(self, "_hf_peft_config_loaded", False):
             # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True
@@ -1909,7 +2084,17 @@ def gradient_checkpointing_disable(self):
         activations".
         """
         if self.supports_gradient_checkpointing:
-            self._set_gradient_checkpointing(enable=False)
+            # For old GC format (transformers < 4.35.0) for models that live on the Hub
+            # we will fall back to the overwritten `_set_gradient_checkpointing` methid
+            _is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters
+            if not _is_using_old_format:
+                self._set_gradient_checkpointing(enable=False)
+            else:
+                logger.warn(
+                    "You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)."
+                    "Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model."
+                )
+                self.apply(partial(self._set_gradient_checkpointing, value=False))
 
         if getattr(self, "_hf_peft_config_loaded", False):
             self.disable_input_require_grads()
@@ -2022,6 +2207,9 @@ def save_pretrained(
                 "You are calling `save_pretrained` on a 4-bit converted model. This is currently not supported"
             )
 
+        if getattr(self, "_awq_is_fused", False):
+            raise ValueError("You cannot save an AWQ model that uses fused modules!")
+
         if "save_config" in kwargs:
             warnings.warn(
                 "`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead."
@@ -2677,18 +2865,12 @@ def from_pretrained(
                 )
 
         quantization_method_from_args = None
+
         if quantization_config is not None:
             quantization_method_from_args = getattr(
                 quantization_config, "quant_method", QuantizationMethod.BITS_AND_BYTES
             )
 
-            if quantization_method_from_args == QuantizationMethod.AWQ:
-                raise ValueError(
-                    "You cannot pass an `AwqConfig` when loading a model as you can only use AWQ models"
-                    " for inference. To quantize transformers models with AWQ algorithm, please refer to our"
-                    " quantization docs: https://huggingface.co/docs/transformers/main_classes/quantization "
-                )
-
         if quantization_config is None and (load_in_8bit or load_in_4bit):
             quantization_method_from_args = QuantizationMethod.BITS_AND_BYTES
             quantization_config, kwargs = BitsAndBytesConfig.from_dict(
@@ -2711,11 +2893,13 @@ def from_pretrained(
                 )
 
         if load_in_8bit or load_in_4bit:
+            if not torch.cuda.is_available():
+                raise RuntimeError("No GPU found. A GPU is needed for quantization.")
             if not (is_accelerate_available() and is_bitsandbytes_available()):
                 raise ImportError(
                     "Using `load_in_8bit=True` requires Accelerate: `pip install accelerate` and the latest version of"
                     " bitsandbytes `pip install -i https://test.pypi.org/simple/ bitsandbytes` or"
-                    " pip install bitsandbytes` "
+                    " `pip install bitsandbytes`."
                 )
 
             if torch_dtype is None:
@@ -2729,10 +2913,7 @@ def from_pretrained(
                 torch_dtype = torch.float16
 
             if device_map is None:
-                if torch.cuda.is_available():
-                    device_map = {"": torch.cuda.current_device()}
-                else:
-                    raise RuntimeError("No GPU found. A GPU is needed for quantization.")
+                device_map = {"": torch.cuda.current_device()}
                 logger.info(
                     "The device_map was not initialized. "
                     "Setting device_map to {'':torch.cuda.current_device()}. "
@@ -2782,21 +2963,36 @@ def from_pretrained(
             quantization_method_from_config = config.quantization_config.get(
                 "quant_method", QuantizationMethod.BITS_AND_BYTES
             )
+
+        if (
+            quantization_method_from_args is not None
+            and quantization_method_from_args == QuantizationMethod.AWQ
+            and quantization_method_from_config is None
+        ):
+            raise ValueError(
+                "You cannot quantize with AWQ a non-quantized model using transformers, please refer to the quantization documentation"
+                " to read more about how to quantize models with AWQ algorithm https://huggingface.co/docs/transformers/main_classes/quantization"
+            )
+
         if quantization_method_from_config is not None and quantization_method_from_args is not None:
             if quantization_method_from_config != quantization_method_from_args:
                 raise ValueError(
                     f"The model is already quantized with {quantization_method_from_config}. "
                     f"You can't quantize it again with {quantization_method_from_args}"
                 )
-        if quantization_method_from_config == QuantizationMethod.GPTQ and quantization_method_from_args is not None:
+
+        if (
+            quantization_method_from_config in (QuantizationMethod.GPTQ, QuantizationMethod.AWQ)
+            and quantization_method_from_args is not None
+        ):
             loading_attr_dict = quantization_config.get_loading_attributes()
             for attr, val in loading_attr_dict.items():
                 config.quantization_config[attr] = val
             quantization_method_from_args = None
             logger.warning(
-                "You passed `quantization_config` to `from_pretrained` but the model you're loading already has a "
-                "`quantization_config` attribute and has already quantized weights. However, loading attributes"
-                " (e.g. use_exllama, exllama_config, use_cuda_fp16, max_input_length) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored."
+                f"You passed `quantization_config` to `from_pretrained` but the model you're loading already has a "
+                f"`quantization_config` attribute and has already quantized weights. However, loading attributes"
+                f" (e.g. {list(loading_attr_dict.keys())}) will be overwritten with the one you passed to `from_pretrained`. The rest will be ignored."
             )
         if (
             quantization_method_from_args == QuantizationMethod.GPTQ
@@ -3054,9 +3250,18 @@ def from_pretrained(
                         if resolved_archive_file is not None:
                             is_sharded = True
                         elif use_safetensors:
-                            raise EnvironmentError(
-                                f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} and thus cannot be loaded with `safetensors`. Please make sure that the model has been saved with `safe_serialization=True` or do not set `use_safetensors=True`."
-                            )
+                            if revision == "main":
+                                resolved_archive_file, revision, is_sharded = auto_conversion(
+                                    pretrained_model_name_or_path, **cached_file_kwargs
+                                )
+                            cached_file_kwargs["revision"] = revision
+                            if resolved_archive_file is None:
+                                raise EnvironmentError(
+                                    f"{pretrained_model_name_or_path} does not appear to have a file named"
+                                    f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} "
+                                    "and thus cannot be loaded with `safetensors`. Please make sure that the model has "
+                                    "been saved with `safe_serialization=True` or do not set `use_safetensors=True`."
+                                )
                         else:
                             # This repo has no safetensors file of any kind, we switch to PyTorch.
                             filename = _add_variant(WEIGHTS_NAME, variant)
@@ -3110,7 +3315,7 @@ def from_pretrained(
                     # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted
                     # to the original exception.
                     raise
-                except Exception:
+                except Exception as e:
                     # For any other exception, we throw a generic error.
                     raise EnvironmentError(
                         f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it"
@@ -3118,7 +3323,7 @@ def from_pretrained(
                         f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a"
                         f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)},"
                         f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}."
-                    )
+                    ) from e
 
             if is_local:
                 logger.info(f"loading weights file {archive_file}")
@@ -3235,10 +3440,13 @@ def from_pretrained(
         elif load_in_8bit or load_in_4bit or low_cpu_mem_usage:
             init_contexts.append(init_empty_weights())
 
-        if use_flash_attention_2:
-            config = cls._check_and_enable_flash_attn_2(config, torch_dtype=torch_dtype, device_map=device_map)
+        config = copy.deepcopy(config)  # We do not want to modify the config inplace in from_pretrained.
+        config = cls._autoset_attn_implementation(
+            config, use_flash_attention_2=use_flash_attention_2, torch_dtype=torch_dtype, device_map=device_map
+        )
 
         with ContextManagers(init_contexts):
+            # Let's make sure we don't run the init function of buffer modules
             model = cls(config, *model_args, **model_kwargs)
 
         # make sure we use the model's config since the __init__ call might have copied it
@@ -3315,7 +3523,7 @@ def from_pretrained(
             model = quantizer.convert_model(model)
             model._is_quantized_training_enabled = True
         elif quantization_method_from_config == QuantizationMethod.AWQ:
-            from .integrations import get_keys_to_not_convert, replace_with_awq_linear
+            from .integrations import fuse_awq_modules, get_keys_to_not_convert, replace_with_awq_linear
 
             modules_to_not_convert = get_keys_to_not_convert(model)
 
@@ -3533,6 +3741,14 @@ def from_pretrained(
                 )
                 pass
 
+        if (
+            quantization_config is not None
+            and quantization_config.quant_method == QuantizationMethod.AWQ
+            and quantization_config.do_fuse
+        ):
+            model = fuse_awq_modules(model, config.quantization_config)
+            model._awq_is_fused = True
+
         # Dispatch model with hooks on all devices if necessary
         if device_map is not None:
             device_map_kwargs = {
@@ -3873,7 +4089,18 @@ def _find_mismatched_keys(
                     ignore_mismatched_sizes,
                 )
                 if low_cpu_mem_usage:
-                    if not is_fsdp_enabled() or is_fsdp_enabled_and_dist_rank_0():
+                    if is_fsdp_enabled() and not is_local_dist_rank_0():
+                        for key, param in model_to_load.state_dict().items():
+                            if param.device == torch.device("meta"):
+                                if not (is_quantized):
+                                    set_module_tensor_to_device(
+                                        model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype)
+                                    )
+                                else:
+                                    set_module_quantized_tensor_to_device(
+                                        model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype)
+                                    )
+                    else:
                         new_error_msgs, offload_index, state_dict_index = _load_state_dict_into_meta_model(
                             model_to_load,
                             state_dict,
@@ -3891,17 +4118,6 @@ def _find_mismatched_keys(
                             keep_in_fp32_modules=keep_in_fp32_modules,
                         )
                         error_msgs += new_error_msgs
-                    else:
-                        for key, param in model_to_load.state_dict().items():
-                            if param.device == torch.device("meta"):
-                                if not (is_quantized):
-                                    set_module_tensor_to_device(
-                                        model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype)
-                                    )
-                                else:
-                                    set_module_quantized_tensor_to_device(
-                                        model_to_load, key, "cpu", torch.empty(*param.size(), dtype=dtype)
-                                    )
                 else:
                     error_msgs += _load_state_dict_into_model(model_to_load, state_dict, start_prefix)
 
diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py
index 6132512688e6b1..319c8499319a3f 100644
--- a/src/transformers/models/__init__.py
+++ b/src/transformers/models/__init__.py
@@ -119,6 +119,7 @@
     levit,
     lilt,
     llama,
+    llava,
     longformer,
     longt5,
     luke,
@@ -135,6 +136,7 @@
     megatron_gpt2,
     mgp_str,
     mistral,
+    mixtral,
     mluke,
     mobilebert,
     mobilenet_v1,
@@ -158,6 +160,8 @@
     opt,
     owlv2,
     owlvit,
+    patchtsmixer,
+    patchtst,
     pegasus,
     pegasus_x,
     perceiver,
@@ -184,6 +188,7 @@
     rwkv,
     sam,
     seamless_m4t,
+    seamless_m4t_v2,
     segformer,
     sew,
     sew_d,
@@ -204,15 +209,17 @@
     time_series_transformer,
     timesformer,
     timm_backbone,
-    transfo_xl,
     trocr,
     tvlt,
+    tvp,
     umt5,
     unispeech,
     unispeech_sat,
+    univnet,
     upernet,
     videomae,
     vilt,
+    vipllava,
     vision_encoder_decoder,
     vision_text_dual_encoder,
     visual_bert,
diff --git a/src/transformers/models/albert/modeling_flax_albert.py b/src/transformers/models/albert/modeling_flax_albert.py
index ba054cb1793293..6333f0bd3ac204 100644
--- a/src/transformers/models/albert/modeling_flax_albert.py
+++ b/src/transformers/models/albert/modeling_flax_albert.py
@@ -829,7 +829,9 @@ class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel):
     module_class = FlaxAlbertForMaskedLMModule
 
 
-append_call_sample_docstring(FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)
+append_call_sample_docstring(
+    FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/11"
+)
 
 
 class FlaxAlbertForSequenceClassificationModule(nn.Module):
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index dc01c93406b791..153f7f10def694 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -77,6 +77,8 @@
         "MODEL_WITH_LM_HEAD_MAPPING",
         "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING",
         "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING",
+        "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING",
+        "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING",
         "AutoModel",
         "AutoBackbone",
         "AutoModelForAudioClassification",
@@ -250,6 +252,8 @@
             MODEL_FOR_TEXT_ENCODING_MAPPING,
             MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING,
             MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING,
+            MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING,
+            MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING,
             MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
             MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING,
             MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING,
diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py
index c1c2387373b8bd..b91226ac877897 100755
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -56,6 +56,7 @@
         ("chinese_clip", "ChineseCLIPConfig"),
         ("clap", "ClapConfig"),
         ("clip", "CLIPConfig"),
+        ("clip_vision_model", "CLIPVisionConfig"),
         ("clipseg", "CLIPSegConfig"),
         ("clvp", "ClvpConfig"),
         ("code_llama", "LlamaConfig"),
@@ -126,6 +127,7 @@
         ("levit", "LevitConfig"),
         ("lilt", "LiltConfig"),
         ("llama", "LlamaConfig"),
+        ("llava", "LlavaConfig"),
         ("longformer", "LongformerConfig"),
         ("longt5", "LongT5Config"),
         ("luke", "LukeConfig"),
@@ -142,6 +144,7 @@
         ("megatron-bert", "MegatronBertConfig"),
         ("mgp-str", "MgpstrConfig"),
         ("mistral", "MistralConfig"),
+        ("mixtral", "MixtralConfig"),
         ("mobilebert", "MobileBertConfig"),
         ("mobilenet_v1", "MobileNetV1Config"),
         ("mobilenet_v2", "MobileNetV2Config"),
@@ -164,6 +167,8 @@
         ("opt", "OPTConfig"),
         ("owlv2", "Owlv2Config"),
         ("owlvit", "OwlViTConfig"),
+        ("patchtsmixer", "PatchTSMixerConfig"),
+        ("patchtst", "PatchTSTConfig"),
         ("pegasus", "PegasusConfig"),
         ("pegasus_x", "PegasusXConfig"),
         ("perceiver", "PerceiverConfig"),
@@ -190,6 +195,7 @@
         ("rwkv", "RwkvConfig"),
         ("sam", "SamConfig"),
         ("seamless_m4t", "SeamlessM4TConfig"),
+        ("seamless_m4t_v2", "SeamlessM4Tv2Config"),
         ("segformer", "SegformerConfig"),
         ("sew", "SEWConfig"),
         ("sew-d", "SEWDConfig"),
@@ -214,13 +220,16 @@
         ("transfo-xl", "TransfoXLConfig"),
         ("trocr", "TrOCRConfig"),
         ("tvlt", "TvltConfig"),
+        ("tvp", "TvpConfig"),
         ("umt5", "UMT5Config"),
         ("unispeech", "UniSpeechConfig"),
         ("unispeech-sat", "UniSpeechSatConfig"),
+        ("univnet", "UnivNetConfig"),
         ("upernet", "UperNetConfig"),
         ("van", "VanConfig"),
         ("videomae", "VideoMAEConfig"),
         ("vilt", "ViltConfig"),
+        ("vipllava", "VipLlavaConfig"),
         ("vision-encoder-decoder", "VisionEncoderDecoderConfig"),
         ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"),
         ("visual_bert", "VisualBertConfig"),
@@ -343,6 +352,7 @@
         ("levit", "LEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("lilt", "LILT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("llama", "LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("llava", "LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("longformer", "LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("longt5", "LONGT5_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("luke", "LUKE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -357,6 +367,7 @@
         ("megatron-bert", "MEGATRON_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mgp-str", "MGP_STR_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mistral", "MISTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("mixtral", "MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mobilenet_v1", "MOBILENET_V1_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mobilenet_v2", "MOBILENET_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("mobilevit", "MOBILEVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -376,6 +387,8 @@
         ("opt", "OPT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("owlv2", "OWLV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("owlvit", "OWLVIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("patchtsmixer", "PATCHTSMIXER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("patchtst", "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("pegasus", "PEGASUS_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("pegasus_x", "PEGASUS_X_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("perceiver", "PERCEIVER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -400,6 +413,7 @@
         ("rwkv", "RWKV_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("sam", "SAM_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("seamless_m4t", "SEAMLESS_M4T_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("seamless_m4t_v2", "SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("segformer", "SEGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("sew", "SEW_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("sew-d", "SEW_D_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -420,11 +434,14 @@
         ("timesformer", "TIMESFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("transfo-xl", "TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("tvlt", "TVLT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("tvp", "TVP_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("unispeech", "UNISPEECH_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("unispeech-sat", "UNISPEECH_SAT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("univnet", "UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("van", "VAN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("videomae", "VIDEOMAE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("vilt", "VILT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
+        ("vipllava", "VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("vit", "VIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
         ("vit_hybrid", "VIT_HYBRID_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@@ -484,6 +501,7 @@
         ("chinese_clip", "Chinese-CLIP"),
         ("clap", "CLAP"),
         ("clip", "CLIP"),
+        ("clip_vision_model", "CLIPVisionModel"),
         ("clipseg", "CLIPSeg"),
         ("clvp", "CLVP"),
         ("code_llama", "CodeLlama"),
@@ -563,11 +581,13 @@
         ("lilt", "LiLT"),
         ("llama", "LLaMA"),
         ("llama2", "Llama2"),
+        ("llava", "LLaVa"),
         ("longformer", "Longformer"),
         ("longt5", "LongT5"),
         ("luke", "LUKE"),
         ("lxmert", "LXMERT"),
         ("m2m_100", "M2M100"),
+        ("madlad-400", "MADLAD-400"),
         ("marian", "Marian"),
         ("markuplm", "MarkupLM"),
         ("mask2former", "Mask2Former"),
@@ -582,6 +602,7 @@
         ("megatron_gpt2", "Megatron-GPT2"),
         ("mgp-str", "MGP-STR"),
         ("mistral", "Mistral"),
+        ("mixtral", "Mixtral"),
         ("mluke", "mLUKE"),
         ("mms", "MMS"),
         ("mobilebert", "MobileBERT"),
@@ -607,6 +628,8 @@
         ("opt", "OPT"),
         ("owlv2", "OWLv2"),
         ("owlvit", "OWL-ViT"),
+        ("patchtsmixer", "PatchTSMixer"),
+        ("patchtst", "PatchTST"),
         ("pegasus", "Pegasus"),
         ("pegasus_x", "PEGASUS-X"),
         ("perceiver", "Perceiver"),
@@ -634,6 +657,7 @@
         ("rwkv", "RWKV"),
         ("sam", "SAM"),
         ("seamless_m4t", "SeamlessM4T"),
+        ("seamless_m4t_v2", "SeamlessM4Tv2"),
         ("segformer", "SegFormer"),
         ("sew", "SEW"),
         ("sew-d", "SEW-D"),
@@ -660,14 +684,17 @@
         ("transfo-xl", "Transformer-XL"),
         ("trocr", "TrOCR"),
         ("tvlt", "TVLT"),
+        ("tvp", "TVP"),
         ("ul2", "UL2"),
         ("umt5", "UMT5"),
         ("unispeech", "UniSpeech"),
         ("unispeech-sat", "UniSpeechSat"),
+        ("univnet", "UnivNet"),
         ("upernet", "UPerNet"),
         ("van", "VAN"),
         ("videomae", "VideoMAE"),
         ("vilt", "ViLT"),
+        ("vipllava", "VipLlava"),
         ("vision-encoder-decoder", "Vision Encoder decoder"),
         ("vision-text-dual-encoder", "VisionTextDualEncoder"),
         ("visual_bert", "VisualBERT"),
@@ -700,6 +727,8 @@
     ]
 )
 
+# This is tied to the processing `-` -> `_` in `model_type_to_module_name`. For example, instead of putting
+# `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`.
 DEPRECATED_MODELS = [
     "bort",
     "mctct",
@@ -708,6 +737,7 @@
     "retribert",
     "tapex",
     "trajectory_transformer",
+    "transfo_xl",
     "van",
 ]
 
@@ -721,6 +751,7 @@
         ("kosmos-2", "kosmos2"),
         ("maskformer-swin", "maskformer"),
         ("xclip", "x_clip"),
+        ("clip_vision_model", "clip"),
     ]
 )
 
@@ -1067,7 +1098,14 @@ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
                 config_class.register_for_auto_class()
             return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif "model_type" in config_dict:
-            config_class = CONFIG_MAPPING[config_dict["model_type"]]
+            try:
+                config_class = CONFIG_MAPPING[config_dict["model_type"]]
+            except KeyError:
+                raise ValueError(
+                    f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` "
+                    "but Transformers does not recognize this architecture. This could be because of an "
+                    "issue with the checkpoint, or because your version of Transformers is out of date."
+                )
             return config_class.from_dict(config_dict, **unused_kwargs)
         else:
             # Fallback: use pattern matching on the string.
diff --git a/src/transformers/models/auto/feature_extraction_auto.py b/src/transformers/models/auto/feature_extraction_auto.py
index 2c269950264262..457217566e7cfa 100644
--- a/src/transformers/models/auto/feature_extraction_auto.py
+++ b/src/transformers/models/auto/feature_extraction_auto.py
@@ -78,6 +78,7 @@
         ("regnet", "ConvNextFeatureExtractor"),
         ("resnet", "ConvNextFeatureExtractor"),
         ("seamless_m4t", "SeamlessM4TFeatureExtractor"),
+        ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"),
         ("segformer", "SegformerFeatureExtractor"),
         ("sew", "Wav2Vec2FeatureExtractor"),
         ("sew-d", "Wav2Vec2FeatureExtractor"),
@@ -91,6 +92,7 @@
         ("tvlt", "TvltFeatureExtractor"),
         ("unispeech", "Wav2Vec2FeatureExtractor"),
         ("unispeech-sat", "Wav2Vec2FeatureExtractor"),
+        ("univnet", "UnivNetFeatureExtractor"),
         ("van", "ConvNextFeatureExtractor"),
         ("videomae", "VideoMAEFeatureExtractor"),
         ("vilt", "ViltFeatureExtractor"),
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
index f52d18a7076777..446c9adf1b6dc3 100644
--- a/src/transformers/models/auto/image_processing_auto.py
+++ b/src/transformers/models/auto/image_processing_auto.py
@@ -71,9 +71,11 @@
         ("idefics", "IdeficsImageProcessor"),
         ("imagegpt", "ImageGPTImageProcessor"),
         ("instructblip", "BlipImageProcessor"),
+        ("kosmos-2", "CLIPImageProcessor"),
         ("layoutlmv2", "LayoutLMv2ImageProcessor"),
         ("layoutlmv3", "LayoutLMv3ImageProcessor"),
         ("levit", "LevitImageProcessor"),
+        ("llava", "CLIPImageProcessor"),
         ("mask2former", "Mask2FormerImageProcessor"),
         ("maskformer", "MaskFormerImageProcessor"),
         ("mgp-str", "ViTImageProcessor"),
@@ -102,10 +104,12 @@
         ("table-transformer", "DetrImageProcessor"),
         ("timesformer", "VideoMAEImageProcessor"),
         ("tvlt", "TvltImageProcessor"),
+        ("tvp", "TvpImageProcessor"),
         ("upernet", "SegformerImageProcessor"),
         ("van", "ConvNextImageProcessor"),
         ("videomae", "VideoMAEImageProcessor"),
         ("vilt", "ViltImageProcessor"),
+        ("vipllava", "CLIPImageProcessor"),
         ("vit", "ViTImageProcessor"),
         ("vit_hybrid", "ViTHybridImageProcessor"),
         ("vit_mae", "ViTImageProcessor"),
diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py
index ffcae9a234942c..e562bd28bdb3f3 100755
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -18,7 +18,12 @@
 from collections import OrderedDict
 
 from ...utils import logging
-from .auto_factory import _BaseAutoBackboneClass, _BaseAutoModelClass, _LazyAutoMapping, auto_class_update
+from .auto_factory import (
+    _BaseAutoBackboneClass,
+    _BaseAutoModelClass,
+    _LazyAutoMapping,
+    auto_class_update,
+)
 from .configuration_auto import CONFIG_MAPPING_NAMES
 
 
@@ -54,6 +59,7 @@
         ("chinese_clip", "ChineseCLIPModel"),
         ("clap", "ClapModel"),
         ("clip", "CLIPModel"),
+        ("clip_vision_model", "CLIPVisionModel"),
         ("clipseg", "CLIPSegModel"),
         ("clvp", "ClvpModelForConditionalGeneration"),
         ("code_llama", "LlamaModel"),
@@ -137,6 +143,7 @@
         ("megatron-bert", "MegatronBertModel"),
         ("mgp-str", "MgpstrForSceneTextRecognition"),
         ("mistral", "MistralModel"),
+        ("mixtral", "MixtralModel"),
         ("mobilebert", "MobileBertModel"),
         ("mobilenet_v1", "MobileNetV1Model"),
         ("mobilenet_v2", "MobileNetV2Model"),
@@ -157,6 +164,8 @@
         ("opt", "OPTModel"),
         ("owlv2", "Owlv2Model"),
         ("owlvit", "OwlViTModel"),
+        ("patchtsmixer", "PatchTSMixerModel"),
+        ("patchtst", "PatchTSTModel"),
         ("pegasus", "PegasusModel"),
         ("pegasus_x", "PegasusXModel"),
         ("perceiver", "PerceiverModel"),
@@ -179,6 +188,7 @@
         ("rwkv", "RwkvModel"),
         ("sam", "SamModel"),
         ("seamless_m4t", "SeamlessM4TModel"),
+        ("seamless_m4t_v2", "SeamlessM4Tv2Model"),
         ("segformer", "SegformerModel"),
         ("sew", "SEWModel"),
         ("sew-d", "SEWDModel"),
@@ -200,9 +210,11 @@
         ("trajectory_transformer", "TrajectoryTransformerModel"),
         ("transfo-xl", "TransfoXLModel"),
         ("tvlt", "TvltModel"),
+        ("tvp", "TvpModel"),
         ("umt5", "UMT5Model"),
         ("unispeech", "UniSpeechModel"),
         ("unispeech-sat", "UniSpeechSatModel"),
+        ("univnet", "UnivNetModel"),
         ("van", "VanModel"),
         ("videomae", "VideoMAEModel"),
         ("vilt", "ViltModel"),
@@ -260,6 +272,7 @@
         ("ibert", "IBertForMaskedLM"),
         ("idefics", "IdeficsForVisionText2Text"),
         ("layoutlm", "LayoutLMForMaskedLM"),
+        ("llava", "LlavaForConditionalGeneration"),
         ("longformer", "LongformerForMaskedLM"),
         ("luke", "LukeForMaskedLM"),
         ("lxmert", "LxmertForPreTraining"),
@@ -288,6 +301,7 @@
         ("unispeech", "UniSpeechForPreTraining"),
         ("unispeech-sat", "UniSpeechSatForPreTraining"),
         ("videomae", "VideoMAEForPreTraining"),
+        ("vipllava", "VipLlavaForConditionalGeneration"),
         ("visual_bert", "VisualBertForPreTraining"),
         ("vit_mae", "ViTMAEForPreTraining"),
         ("wav2vec2", "Wav2Vec2ForPreTraining"),
@@ -419,6 +433,7 @@
         ("mega", "MegaForCausalLM"),
         ("megatron-bert", "MegatronBertForCausalLM"),
         ("mistral", "MistralForCausalLM"),
+        ("mixtral", "MixtralForCausalLM"),
         ("mpt", "MptForCausalLM"),
         ("musicgen", "MusicgenForCausalLM"),
         ("mvp", "MvpForCausalLM"),
@@ -479,7 +494,10 @@
         ("convnextv2", "ConvNextV2ForImageClassification"),
         ("cvt", "CvtForImageClassification"),
         ("data2vec-vision", "Data2VecVisionForImageClassification"),
-        ("deit", ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher")),
+        (
+            "deit",
+            ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher"),
+        ),
         ("dinat", "DinatForImageClassification"),
         ("dinov2", "Dinov2ForImageClassification"),
         (
@@ -492,7 +510,10 @@
         ("efficientnet", "EfficientNetForImageClassification"),
         ("focalnet", "FocalNetForImageClassification"),
         ("imagegpt", "ImageGPTForImageClassification"),
-        ("levit", ("LevitForImageClassification", "LevitForImageClassificationWithTeacher")),
+        (
+            "levit",
+            ("LevitForImageClassification", "LevitForImageClassificationWithTeacher"),
+        ),
         ("mobilenet_v1", "MobileNetV1ForImageClassification"),
         ("mobilenet_v2", "MobileNetV2ForImageClassification"),
         ("mobilevit", "MobileViTForImageClassification"),
@@ -576,7 +597,9 @@
         ("git", "GitForCausalLM"),
         ("instructblip", "InstructBlipForConditionalGeneration"),
         ("kosmos-2", "Kosmos2ForConditionalGeneration"),
+        ("llava", "LlavaForConditionalGeneration"),
         ("pix2struct", "Pix2StructForConditionalGeneration"),
+        ("vipllava", "VipLlavaForConditionalGeneration"),
         ("vision-encoder-decoder", "VisionEncoderDecoderModel"),
     ]
 )
@@ -682,6 +705,7 @@
         ("plbart", "PLBartForConditionalGeneration"),
         ("prophetnet", "ProphetNetForConditionalGeneration"),
         ("seamless_m4t", "SeamlessM4TForTextToText"),
+        ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToText"),
         ("switch_transformers", "SwitchTransformersForConditionalGeneration"),
         ("t5", "T5ForConditionalGeneration"),
         ("umt5", "UMT5ForConditionalGeneration"),
@@ -693,6 +717,7 @@
     [
         ("pop2piano", "Pop2PianoForConditionalGeneration"),
         ("seamless_m4t", "SeamlessM4TForSpeechToText"),
+        ("seamless_m4t_v2", "SeamlessM4Tv2ForSpeechToText"),
         ("speech-encoder-decoder", "SpeechEncoderDecoderModel"),
         ("speech_to_text", "Speech2TextForConditionalGeneration"),
         ("speecht5", "SpeechT5ForSpeechToText"),
@@ -747,6 +772,7 @@
         ("mega", "MegaForSequenceClassification"),
         ("megatron-bert", "MegatronBertForSequenceClassification"),
         ("mistral", "MistralForSequenceClassification"),
+        ("mixtral", "MixtralForSequenceClassification"),
         ("mobilebert", "MobileBertForSequenceClassification"),
         ("mpnet", "MPNetForSequenceClassification"),
         ("mpt", "MptForSequenceClassification"),
@@ -1059,6 +1085,7 @@
         ("bark", "BarkModel"),
         ("musicgen", "MusicgenForConditionalGeneration"),
         ("seamless_m4t", "SeamlessM4TForTextToSpeech"),
+        ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToSpeech"),
         ("vits", "VitsModel"),
     ]
 )
@@ -1078,6 +1105,7 @@
 MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict(
     [
         # Backbone mapping
+        ("beit", "BeitBackbone"),
         ("bit", "BitBackbone"),
         ("convnext", "ConvNextBackbone"),
         ("convnextv2", "ConvNextV2Backbone"),
@@ -1130,6 +1158,20 @@
     ]
 )
 
+MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
+    [
+        ("patchtsmixer", "PatchTSMixerForTimeSeriesClassification"),
+        ("patchtst", "PatchTSTForClassification"),
+    ]
+)
+
+MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES = OrderedDict(
+    [
+        ("patchtsmixer", "PatchTSMixerForRegression"),
+        ("patchtst", "PatchTSTForRegression"),
+    ]
+)
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = OrderedDict(
     [
         ("swin2sr", "Swin2SRForImageSuperResolution"),
@@ -1221,6 +1263,14 @@
 
 MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES)
 
+MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES
+)
+
+MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = _LazyAutoMapping(
+    CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES
+)
+
 MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)
 
 
@@ -1277,7 +1327,9 @@ class AutoModelForSeq2SeqLM(_BaseAutoModelClass):
 
 
 AutoModelForSeq2SeqLM = auto_class_update(
-    AutoModelForSeq2SeqLM, head_doc="sequence-to-sequence language modeling", checkpoint_for_example="t5-base"
+    AutoModelForSeq2SeqLM,
+    head_doc="sequence-to-sequence language modeling",
+    checkpoint_for_example="t5-base",
 )
 
 
diff --git a/src/transformers/models/auto/modeling_flax_auto.py b/src/transformers/models/auto/modeling_flax_auto.py
index ebc768963429c1..bf7d87e4e2dbd4 100644
--- a/src/transformers/models/auto/modeling_flax_auto.py
+++ b/src/transformers/models/auto/modeling_flax_auto.py
@@ -43,6 +43,7 @@
         ("gpt2", "FlaxGPT2Model"),
         ("gpt_neo", "FlaxGPTNeoModel"),
         ("gptj", "FlaxGPTJModel"),
+        ("llama", "FlaxLlamaModel"),
         ("longt5", "FlaxLongT5Model"),
         ("marian", "FlaxMarianModel"),
         ("mbart", "FlaxMBartModel"),
@@ -146,6 +147,7 @@
         ("gpt2", "FlaxGPT2LMHeadModel"),
         ("gpt_neo", "FlaxGPTNeoForCausalLM"),
         ("gptj", "FlaxGPTJForCausalLM"),
+        ("llama", "FlaxLlamaForCausalLM"),
         ("opt", "FlaxOPTForCausalLM"),
         ("roberta", "FlaxRobertaForCausalLM"),
         ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForCausalLM"),
diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py
index 84f7ba3be5bf4d..93dc6ab6050bb9 100644
--- a/src/transformers/models/auto/processing_auto.py
+++ b/src/transformers/models/auto/processing_auto.py
@@ -64,6 +64,7 @@
         ("kosmos-2", "Kosmos2Processor"),
         ("layoutlmv2", "LayoutLMv2Processor"),
         ("layoutlmv3", "LayoutLMv3Processor"),
+        ("llava", "LlavaProcessor"),
         ("markuplm", "MarkupLMProcessor"),
         ("mctct", "MCTCTProcessor"),
         ("mgp-str", "MgpstrProcessor"),
@@ -81,9 +82,11 @@
         ("speecht5", "SpeechT5Processor"),
         ("trocr", "TrOCRProcessor"),
         ("tvlt", "TvltProcessor"),
+        ("tvp", "TvpProcessor"),
         ("unispeech", "Wav2Vec2Processor"),
         ("unispeech-sat", "Wav2Vec2Processor"),
         ("vilt", "ViltProcessor"),
+        ("vipllava", "LlavaProcessor"),
         ("vision-text-dual-encoder", "VisionTextDualEncoderProcessor"),
         ("wav2vec2", "Wav2Vec2Processor"),
         ("wav2vec2-conformer", "Wav2Vec2Processor"),
diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py
index f30bc9ad619524..9e4066de99a5f9 100644
--- a/src/transformers/models/auto/tokenization_auto.py
+++ b/src/transformers/models/auto/tokenization_auto.py
@@ -203,6 +203,7 @@
                     "LlamaTokenizerFast" if is_tokenizers_available() else None,
                 ),
             ),
+            ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)),
             (
                 "longt5",
@@ -239,6 +240,13 @@
                     "LlamaTokenizerFast" if is_tokenizers_available() else None,
                 ),
             ),
+            (
+                "mixtral",
+                (
+                    "LlamaTokenizer" if is_sentencepiece_available() else None,
+                    "LlamaTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
             ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)),
             ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)),
             ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)),
@@ -346,6 +354,13 @@
                     "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None,
                 ),
             ),
+            (
+                "seamless_m4t_v2",
+                (
+                    "SeamlessM4TTokenizer" if is_sentencepiece_available() else None,
+                    "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None,
+                ),
+            ),
             ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)),
             ("speech_to_text_2", ("Speech2Text2Tokenizer", None)),
             ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)),
@@ -371,6 +386,7 @@
             ("tapas", ("TapasTokenizer", None)),
             ("tapex", ("TapexTokenizer", None)),
             ("transfo-xl", ("TransfoXLTokenizer", None)),
+            ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
             (
                 "umt5",
                 (
@@ -379,6 +395,7 @@
                 ),
             ),
             ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
+            ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)),
             ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)),
             ("vits", ("VitsTokenizer", None)),
             ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)),
diff --git a/src/transformers/models/autoformer/modeling_autoformer.py b/src/transformers/models/autoformer/modeling_autoformer.py
index 92e9df2c7e5b1b..3fb9fac5caaa5f 100644
--- a/src/transformers/models/autoformer/modeling_autoformer.py
+++ b/src/transformers/models/autoformer/modeling_autoformer.py
@@ -208,71 +208,70 @@ def forward(self, features: torch.Tensor) -> torch.Tensor:
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Autoformer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer
 class AutoformerStdScaler(nn.Module):
     """
-    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
-    by subtracting from the mean and dividing by the standard deviation.
-
-    Args:
-        dim (`int`):
-            Dimension along which to calculate the mean and standard deviation.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-5):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
+    subtracting from the mean and dividing by the standard deviation.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+    def __init__(self, config: AutoformerConfig):
         super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
 
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        denominator = weights.sum(self.dim, keepdim=self.keepdim)
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
         denominator = denominator.clamp_min(1.0)
-        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
+        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
 
-        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
         scale = torch.sqrt(variance + self.minimum_scale)
         return (data - loc) / scale, loc, scale
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Autoformer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer
 class AutoformerMeanScaler(nn.Module):
     """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
     accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        default_scale (`float`, *optional*, defaults to `None`):
-            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default minimum possible scale that is used for any item.
     """
 
-    def __init__(
-        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
-    ):
+    def __init__(self, config: AutoformerConfig):
         super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-        self.default_scale = default_scale
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
+        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
 
-    @torch.no_grad()
     def forward(
         self, data: torch.Tensor, observed_indicator: torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        # shape: (N, [C], T=1)
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
         ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
         num_observed = observed_indicator.sum(self.dim, keepdim=True)
 
@@ -300,26 +299,29 @@ def forward(
         return scaled_data, torch.zeros_like(scale), scale
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Autoformer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer
 class AutoformerNOPScaler(nn.Module):
     """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False):
+    def __init__(self, config: AutoformerConfig):
         super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
 
     def forward(
-        self, data: torch.Tensor, observed_indicator: torch.Tensor
+        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
         scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
         loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
         return data, loc, scale
@@ -1433,11 +1435,11 @@ def __init__(self, config: AutoformerConfig):
         super().__init__(config)
 
         if config.scaling == "mean" or config.scaling is True:
-            self.scaler = AutoformerMeanScaler(dim=1, keepdim=True)
+            self.scaler = AutoformerMeanScaler(config)
         elif config.scaling == "std":
-            self.scaler = AutoformerStdScaler(dim=1, keepdim=True)
+            self.scaler = AutoformerStdScaler(config)
         else:
-            self.scaler = AutoformerNOPScaler(dim=1, keepdim=True)
+            self.scaler = AutoformerNOPScaler(config)
 
         if config.num_static_categorical_features > 0:
             self.embedder = AutoformerFeatureEmbedder(
diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py
index f8b9eab5d397b3..703886d500ba12 100644
--- a/src/transformers/models/bark/modeling_bark.py
+++ b/src/transformers/models/bark/modeling_bark.py
@@ -34,6 +34,7 @@
     add_start_docstrings_to_model_forward,
     is_accelerate_available,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
 )
 from ..auto import AutoModel
@@ -214,6 +215,15 @@ class BarkSelfFlashAttention2(BarkSelfAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def _split_heads(self, tensor, num_heads, attn_head_size):
         """
         Splits hidden_size dim into attn_head_size and num_heads
@@ -301,6 +311,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -321,13 +337,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -373,7 +389,7 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
 
 
 BARK_ATTENTION_CLASSES = {
-    "default": BarkSelfAttention,
+    "eager": BarkSelfAttention,
     "flash_attention_2": BarkSelfFlashAttention2,
 }
 
@@ -420,8 +436,7 @@ def __init__(self, config, is_causal=False):
             self.layernorm_1 = nn.LayerNorm(config.hidden_size)
             self.layernorm_2 = nn.LayerNorm(config.hidden_size)
 
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
-        self.attn = BARK_ATTENTION_CLASSES[attn_type](config, is_causal=is_causal)
+        self.attn = BARK_ATTENTION_CLASSES[config._attn_implementation](config, is_causal=is_causal)
 
         self.mlp = BarkMLP(config)
 
@@ -654,6 +669,7 @@ def __init__(self, config):
         self.drop = nn.Dropout(config.dropout)
 
         self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
         self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias)
 
@@ -789,7 +805,7 @@ def forward(
         if attention_mask is not None:
             if batch_size <= 0:
                 raise ValueError("batch_size has to be defined and > 0")
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
             else:
                 attention_mask = attention_mask.view(batch_size, -1)
@@ -1249,6 +1265,7 @@ def __init__(self, config):
         self.drop = nn.Dropout(config.dropout)
 
         self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _ in range(config.num_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
         self.layernorm_final = nn.LayerNorm(config.hidden_size)
 
@@ -1418,7 +1435,7 @@ def forward(
         if attention_mask is not None:
             if batch_size <= 0:
                 raise ValueError("batch_size has to be defined and > 0")
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
             else:
                 # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length]
@@ -1859,7 +1876,11 @@ def generate(
 
     @classmethod
     def _check_and_enable_flash_attn_2(
-        cls, config, torch_dtype: Optional[torch.dtype] = None, device_map: Optional[Union[str, Dict[str, int]]] = None
+        cls,
+        config,
+        torch_dtype: Optional[torch.dtype] = None,
+        device_map: Optional[Union[str, Dict[str, int]]] = None,
+        hard_check_only: bool = False,
     ):
         """
         `_check_and_enable_flash_attn_2` originally don't expand flash attention enabling to the model
@@ -1876,12 +1897,14 @@ def _check_and_enable_flash_attn_2(
         The method checks if the current setup is compatible with Flash Attention as it requires the model to be in
         half precision and not ran on CPU.
 
-        If all checks pass, the method will create an attribute in the config `_flash_attn_2_enabled` so that the model
+        If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model
         can initialize the correct attention module
         """
-        config = super()._check_and_enable_flash_attn_2(config, torch_dtype, device_map)
+        config = super()._check_and_enable_flash_attn_2(
+            config, torch_dtype, device_map, hard_check_only=hard_check_only
+        )
 
-        config.semantic_config._flash_attn_2_enabled = getattr(config, "_flash_attn_2_enabled", False)
-        config.coarse_acoustics_config._flash_attn_2_enabled = getattr(config, "_flash_attn_2_enabled", False)
-        config.fine_acoustics_config._flash_attn_2_enabled = getattr(config, "_flash_attn_2_enabled", False)
+        config.semantic_config._attn_implementation = config._attn_implementation
+        config.coarse_acoustics_config._attn_implementation = config._attn_implementation
+        config.fine_acoustics_config._attn_implementation = config._attn_implementation
         return config
diff --git a/src/transformers/models/bart/modeling_bart.py b/src/transformers/models/bart/modeling_bart.py
index efca985f67842e..622264fbe0f98d 100755
--- a/src/transformers/models/bart/modeling_bart.py
+++ b/src/transformers/models/bart/modeling_bart.py
@@ -25,7 +25,12 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -42,6 +47,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
@@ -294,6 +300,15 @@ class BartFlashAttention2(BartAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
 
@@ -418,6 +433,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -438,13 +459,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -489,8 +510,111 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         )
 
 
+class BartSdpaAttention(BartAttention):
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        if output_attentions or layer_head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states,
+                key_value_states=key_value_states,
+                past_key_value=past_key_value,
+                attention_mask=attention_mask,
+                layer_head_mask=layer_head_mask,
+                output_attentions=output_attentions,
+            )
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        query_states = self._shape(query_states, tgt_len, bsz)
+
+        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+        # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+            is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+        )
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
 BART_ATTENTION_CLASSES = {
-    "default": BartAttention,
+    "eager": BartAttention,
+    "sdpa": BartSdpaAttention,
     "flash_attention_2": BartFlashAttention2,
 }
 
@@ -499,9 +623,8 @@ class BartEncoderLayer(nn.Module):
     def __init__(self, config: BartConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = BART_ATTENTION_CLASSES[attn_type](
+        self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -571,8 +694,7 @@ def __init__(self, config: BartConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
-        self.self_attn = BART_ATTENTION_CLASSES[attn_type](
+        self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -585,7 +707,7 @@ def __init__(self, config: BartConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = BART_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = BART_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -719,6 +841,7 @@ class BartPreTrainedModel(PreTrainedModel):
     _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_sdpa = True
 
     def _init_weights(self, module):
         std = self.config.init_std
@@ -945,6 +1068,8 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No
             embed_dim,
         )
         self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
         self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
         self.gradient_checkpointing = False
@@ -1032,8 +1157,13 @@ def forward(
 
         # expand attention_mask
         if attention_mask is not None:
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
+            elif self._use_sdpa and head_mask is None and not output_attentions:
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
             else:
                 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
@@ -1120,6 +1250,9 @@ def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = No
             config.d_model,
         )
         self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
+
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         self.gradient_checkpointing = False
@@ -1238,9 +1371,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input) * self.embed_scale
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
+            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                input_shape,
+                inputs_embeds,
+                past_key_values_length,
+            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
@@ -1249,8 +1391,17 @@ def forward(
 
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
             else:
                 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 encoder_attention_mask = _prepare_4d_attention_mask(
diff --git a/src/transformers/models/bart/modeling_tf_bart.py b/src/transformers/models/bart/modeling_tf_bart.py
index 497dad4249113c..b04e3ed99788e9 100644
--- a/src/transformers/models/bart/modeling_tf_bart.py
+++ b/src/transformers/models/bart/modeling_tf_bart.py
@@ -494,6 +494,12 @@ def dummy_inputs(self):
             dummy_inputs["decoder_input_ids"] = dummy_inputs["decoder_input_ids"] * 2
         return dummy_inputs
 
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if tf_weight == "model.shared.weight":
+            return tf_weight, "model.decoder.embed_tokens.weight"
+        else:
+            return (tf_weight,)
+
 
 BART_START_DOCSTRING = r"""
     This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
diff --git a/src/transformers/models/beit/__init__.py b/src/transformers/models/beit/__init__.py
index 4b631ac1c36aae..ce399f92e0fa4d 100644
--- a/src/transformers/models/beit/__init__.py
+++ b/src/transformers/models/beit/__init__.py
@@ -47,6 +47,7 @@
         "BeitForSemanticSegmentation",
         "BeitModel",
         "BeitPreTrainedModel",
+        "BeitBackbone",
     ]
 
 
@@ -83,6 +84,7 @@
     else:
         from .modeling_beit import (
             BEIT_PRETRAINED_MODEL_ARCHIVE_LIST,
+            BeitBackbone,
             BeitForImageClassification,
             BeitForMaskedImageModeling,
             BeitForSemanticSegmentation,
diff --git a/src/transformers/models/beit/configuration_beit.py b/src/transformers/models/beit/configuration_beit.py
index 102718e17ff13f..b30f4e893fc518 100644
--- a/src/transformers/models/beit/configuration_beit.py
+++ b/src/transformers/models/beit/configuration_beit.py
@@ -21,6 +21,7 @@
 from ...configuration_utils import PretrainedConfig
 from ...onnx import OnnxConfig
 from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
 
 
 logger = logging.get_logger(__name__)
@@ -33,7 +34,7 @@
 }
 
 
-class BeitConfig(PretrainedConfig):
+class BeitConfig(BackboneConfigMixin, PretrainedConfig):
     r"""
     This is the configuration class to store the configuration of a [`BeitModel`]. It is used to instantiate an BEiT
     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
@@ -84,8 +85,6 @@ class BeitConfig(PretrainedConfig):
         use_mean_pooling (`bool`, *optional*, defaults to `True`):
             Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
             CLS token, before applying the classification head.
-        out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
-            Indices of the feature maps to use for semantic segmentation.
         pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
             Pooling scales used in Pooling Pyramid Module applied on the last feature map.
         use_auxiliary_head (`bool`, *optional*, defaults to `True`):
@@ -100,6 +99,20 @@ class BeitConfig(PretrainedConfig):
             Whether to concatenate the output of the auxiliary head with the input before the classification layer.
         semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
             The index that is ignored by the loss function of the semantic segmentation model.
+        out_features (`List[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage.
+        out_indices (`List[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage.
+        add_fpn (`bool`, *optional*, defaults to `False`):
+            Whether to add a FPN as part of the backbone. Only relevant for [`BeitBackbone`].
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`. Only relevant for [`BeitBackbone`].
 
     Example:
 
@@ -140,7 +153,6 @@ def __init__(
         layer_scale_init_value=0.1,
         drop_path_rate=0.1,
         use_mean_pooling=True,
-        out_indices=[3, 5, 7, 11],
         pool_scales=[1, 2, 3, 6],
         use_auxiliary_head=True,
         auxiliary_loss_weight=0.4,
@@ -148,6 +160,10 @@ def __init__(
         auxiliary_num_convs=1,
         auxiliary_concat_input=False,
         semantic_loss_ignore_index=255,
+        out_features=None,
+        out_indices=None,
+        add_fpn=False,
+        reshape_hidden_states=True,
         **kwargs,
     ):
         super().__init__(**kwargs)
@@ -174,7 +190,6 @@ def __init__(
         self.drop_path_rate = drop_path_rate
         self.use_mean_pooling = use_mean_pooling
         # decode head attributes (semantic segmentation)
-        self.out_indices = out_indices
         self.pool_scales = pool_scales
         # auxiliary head attributes (semantic segmentation)
         self.use_auxiliary_head = use_auxiliary_head
@@ -184,6 +199,22 @@ def __init__(
         self.auxiliary_concat_input = auxiliary_concat_input
         self.semantic_loss_ignore_index = semantic_loss_ignore_index
 
+        # handle backwards compatibility
+        if "segmentation_indices" in kwargs:
+            logger.warning(
+                "The `segmentation_indices` argument is deprecated and will be removed in a future version, use `out_indices` instead.",
+                FutureWarning,
+            )
+            out_indices = kwargs.pop("segmentation_indices")
+
+        # backbone attributes
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.add_fpn = add_fpn
+        self.reshape_hidden_states = reshape_hidden_states
+
 
 # Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig
 class BeitOnnxConfig(OnnxConfig):
diff --git a/src/transformers/models/beit/modeling_beit.py b/src/transformers/models/beit/modeling_beit.py
index abc0d3158f58a8..da4721656c0285 100755
--- a/src/transformers/models/beit/modeling_beit.py
+++ b/src/transformers/models/beit/modeling_beit.py
@@ -22,11 +22,12 @@
 
 import torch
 import torch.utils.checkpoint
-from torch import nn
+from torch import Tensor, nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
 from ...modeling_outputs import (
+    BackboneOutput,
     BaseModelOutput,
     BaseModelOutputWithPooling,
     ImageClassifierOutput,
@@ -42,6 +43,7 @@
     logging,
     replace_return_docstrings,
 )
+from ...utils.backbone_utils import BackboneMixin
 from .configuration_beit import BeitConfig
 
 
@@ -149,22 +151,26 @@ def __init__(self, config: BeitConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
-        embeddings = self.patch_embeddings(pixel_values)
+        embeddings, (patch_height, patch_width) = self.patch_embeddings(
+            pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None
+        )
         batch_size, seq_len, _ = embeddings.size()
 
-        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
         if bool_masked_pos is not None:
             mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
             # replace the masked visual tokens by mask_tokens
             w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
             embeddings = embeddings * (1 - w) + mask_tokens * w
 
-        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
         if self.position_embeddings is not None:
-            embeddings = embeddings + self.position_embeddings
+            cls_tokens = cls_tokens + self.position_embeddings[:, :1, :]
+
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
         embeddings = self.dropout(embeddings)
 
-        return embeddings
+        return embeddings, (patch_height, patch_width)
 
 
 class BeitPatchEmbeddings(nn.Module):
@@ -191,19 +197,29 @@ def __init__(self, config):
 
         self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
 
-    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+    def forward(self, pixel_values: torch.Tensor, position_embedding: Optional[torch.Tensor] = None) -> torch.Tensor:
         batch_size, num_channels, height, width = pixel_values.shape
         if num_channels != self.num_channels:
             raise ValueError(
                 "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
             )
-        if height != self.image_size[0] or width != self.image_size[1]:
-            raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+
+        embeddings = self.projection(pixel_values)
+        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+
+        if position_embedding is not None:
+            # interpolate the position embedding to the corresponding size
+            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(
+                0, 3, 1, 2
+            )
+            position_embedding = nn.functional.interpolate(
+                position_embedding, size=(patch_height, patch_width), mode="bicubic"
             )
-        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+            embeddings = embeddings + position_embedding
 
-        return embeddings
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+
+        return embeddings, (patch_height, patch_width)
 
 
 class BeitSelfAttention(nn.Module):
@@ -669,7 +685,7 @@ def forward(
         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
-        embedding_output = self.embeddings(pixel_values, bool_masked_pos)
+        embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values, bool_masked_pos)
 
         encoder_outputs = self.encoder(
             embedding_output,
@@ -1151,6 +1167,12 @@ def __init__(self, config: BeitConfig) -> None:
         self.beit = BeitModel(config, add_pooling_layer=False)
 
         # FPNs
+        if len(self.config.out_indices) != 4:
+            raise ValueError(
+                "BeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
+                "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
+                "a base-sized architecture."
+            )
         self.fpn1 = nn.Sequential(
             nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
             nn.BatchNorm2d(config.hidden_size),
@@ -1280,3 +1302,126 @@ def forward(
             hidden_states=outputs.hidden_states if output_hidden_states else None,
             attentions=outputs.attentions,
         )
+
+
+@add_start_docstrings(
+    """
+    BEiT backbone, to be used with frameworks like DETR and MaskFormer.
+    """,
+    BEIT_START_DOCSTRING,
+)
+class BeitBackbone(BeitPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.embeddings = BeitEmbeddings(config)
+        self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape)
+
+        if config.add_fpn:
+            if len(self.config.out_indices) != 4:
+                raise ValueError(
+                    "BeitBackbone requires config.out_indices to be a list of 4 integers, "
+                    "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
+                    "a base-sized architecture."
+                )
+            hidden_size = config.hidden_size
+            self.fpn1 = nn.Sequential(
+                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
+                nn.BatchNorm2d(hidden_size, eps=config.batch_norm_eps),
+                nn.GELU(),
+                nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2),
+            )
+
+            self.fpn2 = nn.Sequential(nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2))
+            self.fpn3 = nn.Identity()
+            self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2)
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 14, 14]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        batch_size = pixel_values.shape[0]
+        embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values)
+
+        outputs = self.encoder(
+            embedding_output, output_hidden_states=True, output_attentions=output_attentions, return_dict=return_dict
+        )
+
+        hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, 1:, :]
+                    hidden_state = hidden_state.permute(0, 2, 1)
+                    hidden_state = hidden_state.reshape(batch_size, -1, patch_height, patch_width)
+
+                feature_maps += (hidden_state,)
+
+        if self.config.add_fpn:
+            feature_maps = [
+                self.fpn1(feature_maps[0]),
+                self.fpn2(feature_maps[1]),
+                self.fpn3(feature_maps[2]),
+                self.fpn4(feature_maps[3]),
+            ]
+            feature_maps = tuple(feature_maps)
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (feature_maps,) + outputs[1:]
+            else:
+                output = (feature_maps,) + outputs[2:]
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
diff --git a/src/transformers/models/big_bird/modeling_big_bird.py b/src/transformers/models/big_bird/modeling_big_bird.py
index d90aa443dad209..008985f760e867 100755
--- a/src/transformers/models/big_bird/modeling_big_bird.py
+++ b/src/transformers/models/big_bird/modeling_big_bird.py
@@ -2223,7 +2223,7 @@ def _pad_to_block_size(
 
         padding_len = (block_size - seq_len % block_size) % block_size
         if padding_len > 0:
-            logger.info(
+            logger.warning_once(
                 f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
                 f"`config.block_size`: {block_size}"
             )
diff --git a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
index bc9682786942aa..245208d73d32dd 100755
--- a/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
+++ b/src/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py
@@ -2014,7 +2014,7 @@ def _pad_to_block_size(self, hidden_states: torch.Tensor, attention_mask: torch.
 
         padding_len = (block_size - seq_len % block_size) % block_size
         if padding_len > 0:
-            logger.info(
+            logger.warning_once(
                 f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
                 f"`config.block_size`: {block_size}"
             )
diff --git a/src/transformers/models/bit/image_processing_bit.py b/src/transformers/models/bit/image_processing_bit.py
index 235f55afad7884..7aa49145ae0527 100644
--- a/src/transformers/models/bit/image_processing_bit.py
+++ b/src/transformers/models/bit/image_processing_bit.py
@@ -84,10 +84,6 @@ class BitImageProcessor(BaseImageProcessor):
             Can be overridden by the `image_std` parameter in the `preprocess` method.
         do_convert_rgb (`bool`, *optional*, defaults to `True`):
             Whether to convert the image to RGB.
-        use_square_size (`bool`, *optional*, defaults to `False`):
-            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
-            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
-            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
     """
 
     model_input_names = ["pixel_values"]
@@ -105,12 +101,11 @@ def __init__(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_convert_rgb: bool = True,
-        use_square_size: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         size = size if size is not None else {"shortest_edge": 224}
-        size = get_size_dict(size, default_to_square=use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
         crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
 
@@ -125,7 +120,6 @@ def __init__(
         self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
         self.do_convert_rgb = do_convert_rgb
-        self.use_square_size = use_square_size
 
     # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
     def resize(
@@ -153,13 +147,19 @@ def resize(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        size = get_size_dict(size, default_to_square=self.use_square_size)
-        if "shortest_edge" not in size:
-            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
         output_size = get_resize_output_image_size(
             image,
-            size=size["shortest_edge"],
-            default_to_square=self.use_square_size,
+            size=size,
+            default_to_square=default_to_square,
             input_data_format=input_data_format,
         )
         return resize(
@@ -243,7 +243,7 @@ def preprocess(
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
-        size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size)
+        size = get_size_dict(size, param_name="size", default_to_square=False)
         resample = resample if resample is not None else self.resample
         do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
         crop_size = crop_size if crop_size is not None else self.crop_size
diff --git a/src/transformers/models/blenderbot/modeling_blenderbot.py b/src/transformers/models/blenderbot/modeling_blenderbot.py
index 2fbd4621361edd..4512c3b503a4be 100755
--- a/src/transformers/models/blenderbot/modeling_blenderbot.py
+++ b/src/transformers/models/blenderbot/modeling_blenderbot.py
@@ -252,7 +252,7 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-BLENDERBOT_ATTENTION_CLASSES = {"default": BlenderbotAttention}
+BLENDERBOT_ATTENTION_CLASSES = {"eager": BlenderbotAttention}
 
 
 # Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Blenderbot, MBART->BLENDERBOT
@@ -260,9 +260,8 @@ class BlenderbotEncoderLayer(nn.Module):
     def __init__(self, config: BlenderbotConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = BLENDERBOT_ATTENTION_CLASSES[attn_type](
+        self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -332,9 +331,8 @@ class BlenderbotDecoderLayer(nn.Module):
     def __init__(self, config: BlenderbotConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = BLENDERBOT_ATTENTION_CLASSES[attn_type](
+        self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -347,7 +345,7 @@ def __init__(self, config: BlenderbotConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = BLENDERBOT_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
diff --git a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
index 1669602832d866..dc4fa30b875ef2 100755
--- a/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
+++ b/src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
@@ -254,9 +254,8 @@ class BlenderbotSmallEncoderLayer(nn.Module):
     def __init__(self, config: BlenderbotSmallConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[attn_type](
+        self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -321,7 +320,10 @@ def forward(
         return outputs
 
 
-BLENDERBOT_SMALL_ATTENTION_CLASSES = {"default": BlenderbotSmallAttention}
+# TODO: Implement attention with SDPA for TimeSeriesTransformer.
+BLENDERBOT_SMALL_ATTENTION_CLASSES = {
+    "eager": BlenderbotSmallAttention,
+}
 
 
 # Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->BlenderbotSmall, BART->BLENDERBOT_SMALL
@@ -330,8 +332,7 @@ def __init__(self, config: BlenderbotSmallConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
-        self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[attn_type](
+        self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -344,7 +345,7 @@ def __init__(self, config: BlenderbotSmallConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
diff --git a/src/transformers/models/blip/image_processing_blip.py b/src/transformers/models/blip/image_processing_blip.py
index fd8873cb7a45c8..d915c5e48b3f56 100644
--- a/src/transformers/models/blip/image_processing_blip.py
+++ b/src/transformers/models/blip/image_processing_blip.py
@@ -57,7 +57,7 @@ class BlipImageProcessor(BaseImageProcessor):
             Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
             overridden by the `resample` parameter in the `preprocess` method.
         do_rescale (`bool`, *optional*, defaults to `True`):
-            Wwhether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
             `do_rescale` parameter in the `preprocess` method.
         rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
             Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
diff --git a/src/transformers/models/bloom/modeling_bloom.py b/src/transformers/models/bloom/modeling_bloom.py
index 3064d5bae69673..ad1e87ec2c26e2 100644
--- a/src/transformers/models/bloom/modeling_bloom.py
+++ b/src/transformers/models/bloom/modeling_bloom.py
@@ -489,7 +489,7 @@ def _convert_to_standard_cache(
 
     @staticmethod
     def _convert_to_bloom_cache(
-        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]
+        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
         """
         Converts the cache to the format expected by Bloom, i.e. to tuple(tuple([batch_size * num_heads, ...]))
diff --git a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py
index 908fef5927af02..d422bc45ab3de0 100644
--- a/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py
+++ b/src/transformers/models/clap/convert_clap_original_pytorch_to_hf.py
@@ -16,8 +16,7 @@
 import argparse
 import re
 
-import torch
-from CLAP import create_model
+from laion_clap import CLAP_Module
 
 from transformers import AutoFeatureExtractor, ClapConfig, ClapModel
 
@@ -38,17 +37,25 @@
 processor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused", truncation="rand_trunc")
 
 
-def init_clap(checkpoint_path, enable_fusion=False):
-    model, model_cfg = create_model(
-        "HTSAT-tiny",
-        "roberta",
-        checkpoint_path,
-        precision="fp32",
-        device="cuda:0" if torch.cuda.is_available() else "cpu",
+def init_clap(checkpoint_path, model_type, enable_fusion=False):
+    model = CLAP_Module(
+        amodel=model_type,
         enable_fusion=enable_fusion,
-        fusion_type="aff_2d" if enable_fusion else None,
     )
-    return model, model_cfg
+    model.load_ckpt(checkpoint_path)
+    return model
+
+
+def get_config_from_original(clap_model):
+    audio_config = {
+        "patch_embeds_hidden_size": clap_model.model.audio_branch.embed_dim,
+        "depths": clap_model.model.audio_branch.depths,
+        "hidden_size": clap_model.model.audio_projection[0].in_features,
+    }
+
+    text_config = {"hidden_size": clap_model.model.text_branch.pooler.dense.in_features}
+
+    return ClapConfig(audio_config=audio_config, text_config=text_config)
 
 
 def rename_state_dict(state_dict):
@@ -94,14 +101,14 @@ def rename_state_dict(state_dict):
     return model_state_dict
 
 
-def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, enable_fusion=False):
-    clap_model, clap_model_cfg = init_clap(checkpoint_path, enable_fusion=enable_fusion)
+def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_path, model_type, enable_fusion=False):
+    clap_model = init_clap(checkpoint_path, model_type, enable_fusion=enable_fusion)
 
     clap_model.eval()
-    state_dict = clap_model.state_dict()
+    state_dict = clap_model.model.state_dict()
     state_dict = rename_state_dict(state_dict)
 
-    transformers_config = ClapConfig()
+    transformers_config = get_config_from_original(clap_model)
     transformers_config.audio_config.enable_fusion = enable_fusion
     model = ClapModel(transformers_config)
 
@@ -118,6 +125,9 @@ def convert_clap_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_pa
     parser.add_argument("--checkpoint_path", default=None, type=str, help="Path to fairseq checkpoint")
     parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
     parser.add_argument("--enable_fusion", action="store_true", help="Whether to enable fusion or not")
+    parser.add_argument("--model_type", default="HTSAT-tiny", type=str, help="Whether to enable fusion or not")
     args = parser.parse_args()
 
-    convert_clap_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.enable_fusion)
+    convert_clap_checkpoint(
+        args.checkpoint_path, args.pytorch_dump_folder_path, args.config_path, args.model_type, args.enable_fusion
+    )
diff --git a/src/transformers/models/clap/modeling_clap.py b/src/transformers/models/clap/modeling_clap.py
index bea7cf2b93ccbd..b2997e1d49353f 100644
--- a/src/transformers/models/clap/modeling_clap.py
+++ b/src/transformers/models/clap/modeling_clap.py
@@ -92,6 +92,7 @@ def window_partition(hidden_states, window_size):
 # Adapted from https://github.com/LAION-AI/CLAP/blob/6ad05a971ba0622f6acee8c41993e0d02bbed639/src/open_clip/htsat.py#L263
 def window_reverse(windows, window_size, height, width):
     """
+    Merges windows to produce higher resolution features.
     Args:
         windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
             Input windows
@@ -102,11 +103,10 @@ def window_reverse(windows, window_size, height, width):
         width (`int`):
             Width of the resized audio
     """
-    batch_size = int(windows.shape[0] / (height * width / window_size / window_size))
-
-    hidden_states = windows.view(batch_size, height // window_size, width // window_size, window_size, window_size, -1)
-    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, height, width, -1)
-    return hidden_states
+    num_channels = windows.shape[-1]
+    windows = windows.view(-1, height // window_size, width // window_size, window_size, window_size, num_channels)
+    windows = windows.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, height, width, num_channels)
+    return windows
 
 
 # Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
diff --git a/src/transformers/models/clip/image_processing_clip.py b/src/transformers/models/clip/image_processing_clip.py
index df9628a66280b7..2c829d0aab948a 100644
--- a/src/transformers/models/clip/image_processing_clip.py
+++ b/src/transformers/models/clip/image_processing_clip.py
@@ -84,10 +84,6 @@ class CLIPImageProcessor(BaseImageProcessor):
             Can be overridden by the `image_std` parameter in the `preprocess` method.
         do_convert_rgb (`bool`, *optional*, defaults to `True`):
             Whether to convert the image to RGB.
-        use_square_size (`bool`, *optional*, defaults to `False`):
-            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
-            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
-            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
     """
 
     model_input_names = ["pixel_values"]
@@ -105,12 +101,11 @@ def __init__(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_convert_rgb: bool = True,
-        use_square_size: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         size = size if size is not None else {"shortest_edge": 224}
-        size = get_size_dict(size, default_to_square=use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
         crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
 
@@ -125,7 +120,10 @@ def __init__(
         self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
         self.do_convert_rgb = do_convert_rgb
-        self.use_square_size = use_square_size
+
+        # for backwards compatibility of KOSMOS-2
+        if "use_square_size" in kwargs:
+            self.size = {"height": size["shortest_edge"], "width": size["shortest_edge"]}
 
     def resize(
         self,
@@ -152,13 +150,19 @@ def resize(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        size = get_size_dict(size, default_to_square=self.use_square_size)
-        if "shortest_edge" not in size:
-            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
         output_size = get_resize_output_image_size(
             image,
-            size=size["shortest_edge"],
-            default_to_square=self.use_square_size,
+            size=size,
+            default_to_square=default_to_square,
             input_data_format=input_data_format,
         )
         return resize(
@@ -242,7 +246,7 @@ def preprocess(
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
-        size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size)
+        size = get_size_dict(size, param_name="size", default_to_square=False)
         resample = resample if resample is not None else self.resample
         do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
         crop_size = crop_size if crop_size is not None else self.crop_size
diff --git a/src/transformers/models/clip/modeling_clip.py b/src/transformers/models/clip/modeling_clip.py
index 9e708e51056aac..77d24a5da32518 100644
--- a/src/transformers/models/clip/modeling_clip.py
+++ b/src/transformers/models/clip/modeling_clip.py
@@ -870,6 +870,7 @@ def forward(
 class CLIPVisionModel(CLIPPreTrainedModel):
     config_class = CLIPVisionConfig
     main_input_name = "pixel_values"
+    _no_split_modules = ["CLIPEncoderLayer"]
 
     def __init__(self, config: CLIPVisionConfig):
         super().__init__(config)
diff --git a/src/transformers/models/clvp/modeling_clvp.py b/src/transformers/models/clvp/modeling_clvp.py
index db2bbe3f00ea1e..64c6927e4a44ba 100644
--- a/src/transformers/models/clvp/modeling_clvp.py
+++ b/src/transformers/models/clvp/modeling_clvp.py
@@ -81,8 +81,7 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)
 
 
-# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
-def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+def apply_rotary_pos_emb(q, k, v, cos, sin, position_ids, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.
 
     Args:
@@ -107,7 +106,51 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     sin = sin[position_ids].unsqueeze(unsqueeze_dim)
     q_embed = (q * cos) + (rotate_half(q) * sin)
     k_embed = (k * cos) + (rotate_half(k) * sin)
-    return q_embed, k_embed
+    v_embed = (v * cos) + (rotate_half(v) * sin)
+    return q_embed, k_embed, v_embed
+
+
+def _pad_extra_bos_eos_tokens(
+    input_ids,
+    attention_mask=None,
+    pad_token_id=0,
+    bos_token_id=255,
+    eos_token_id=0,
+    add_bos_token=True,
+    add_eos_token=True,
+):
+    """
+    This method adds extra bos and eos tokens to input_ids and accordingly modifies the attention_mask which is used in
+    `ClvpConditioningEncoder` and the generation loop of the `ClvpModelForConditionalGeneration`.
+    """
+
+    # add the bos token at the beginning
+    if add_bos_token:
+        input_ids = torch.nn.functional.pad(input_ids, (1, 0), value=bos_token_id)
+        attention_mask = (
+            torch.nn.functional.pad(attention_mask, (1, 0), value=1) if attention_mask is not None else attention_mask
+        )
+
+    modified_input_ids = input_ids
+    if add_eos_token:
+        modified_input_ids = torch.zeros(
+            (input_ids.shape[0], input_ids.shape[1] + 1), dtype=input_ids.dtype, device=input_ids.device
+        )
+        for i, each_input_id in enumerate(input_ids):
+            # locate where the valid tokens end and then add the eos token
+            if torch.isin(each_input_id, pad_token_id).sum():
+                pos = torch.where(each_input_id == pad_token_id)[0].min()
+                modified_input_ids[i] = torch.concatenate(
+                    [each_input_id[:pos], torch.tensor([eos_token_id], device=input_ids.device), each_input_id[pos:]]
+                )
+            else:
+                # if there are no pad tokens present, then add eos to the end
+                modified_input_ids[i] = torch.nn.functional.pad(each_input_id, (0, 1), value=eos_token_id)
+        attention_mask = (
+            torch.nn.functional.pad(attention_mask, (1, 0), value=1) if attention_mask is not None else attention_mask
+        )
+
+    return modified_input_ids, attention_mask
 
 
 @dataclass
@@ -312,13 +355,18 @@ def forward(
                 key_states[..., :rotary_emb_dim],
                 key_states[..., rotary_emb_dim:],
             )
+            value_rot, value_pass = (
+                value_states[..., :rotary_emb_dim],
+                value_states[..., rotary_emb_dim:],
+            )
 
             cos, sin = rotary_pos_emb.cos().squeeze(0), rotary_pos_emb.sin().squeeze(0)
-            query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
+            query_rot, key_rot, value_rot = apply_rotary_pos_emb(query_rot, key_rot, value_rot, cos, sin, position_ids)
 
             # [batch_size, num_heads, seq_length, head_dim]
             query_states = torch.cat((query_rot, query_pass), dim=-1)
             key_states = torch.cat((key_rot, key_pass), dim=-1)
+            value_states = torch.cat((value_rot, value_pass), dim=-1)
 
         tgt_len = query_states.shape[2]
         src_len = key_states.shape[2]
@@ -599,16 +647,7 @@ def forward(
         if input_ids is not None and inputs_embeds is not None:
             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
         elif input_ids is not None:
-            # We add bos and eos input_ids in the modeling file instead of the tokenizer file to keep the logic simple
-            # This logic is specific to ClvpConditioningEncoder and not used by other modules.
-            input_ids = torch.nn.functional.pad(input_ids, (1, 0), value=self.text_config.bos_token_id)
-            input_ids = torch.nn.functional.pad(input_ids, (0, 1), value=self.text_config.eos_token_id)
             batch_size, seq_length = input_ids.size()
-            inputs_embeds = self.text_token_embedding(input_ids)
-            # check if we need to update attention mask, if yes then pad it too
-            if attention_mask is not None and attention_mask.shape[1] != seq_length:
-                attention_mask = torch.nn.functional.pad(attention_mask, (1, 0), value=1)
-                attention_mask = torch.nn.functional.pad(attention_mask, (0, 1), value=1)
         elif inputs_embeds is not None:
             batch_size, seq_length = inputs_embeds.size()[:-1]
         else:
@@ -616,8 +655,18 @@ def forward(
 
         # construct attention mask if not given
         if attention_mask is None:
-            attention_mask = torch.ones([batch_size, seq_length], dtype=torch.long, device=inputs_embeds.device)
+            attention_mask = torch.ones([batch_size, seq_length], dtype=torch.long, device=input_ids.device)
+
+        # We add bos and eos input_ids in the modeling file instead of the tokenizer file to keep the logic simple
+        # This logic is specific to ClvpConditioningEncoder and not used by other modules.
+        input_ids, attention_mask = _pad_extra_bos_eos_tokens(
+            input_ids,
+            attention_mask,
+            bos_token_id=self.text_config.bos_token_id,
+            eos_token_id=self.text_config.eos_token_id,
+        )
 
+        inputs_embeds = self.text_token_embedding(input_ids)
         position_ids = attention_mask.cumsum(-1) - 1
         position_embeds = self.text_position_embedding(position_ids)
         text_embeds = inputs_embeds + position_embeds
@@ -1512,10 +1561,6 @@ def fix_speech_decoder_output(self, speech_ids: torch.LongTensor) -> torch.LongT
         """
         decoder_fixing_codes = self.config.decoder_config.decoder_fixing_codes
         speech_ids = speech_ids[:, 1:]
-        if torch.isin(self.speech_decoder_model.config.eos_token_id, speech_ids):
-            speech_ids = torch.nn.functional.pad(
-                speech_ids, pad=(0, 1), value=self.speech_decoder_model.config.eos_token_id
-            )
 
         stop_token_indices = torch.where(speech_ids == self.speech_decoder_model.config.eos_token_id, 1, 0)
         speech_ids = torch.masked_fill(speech_ids, mask=stop_token_indices.bool(), value=decoder_fixing_codes[0])
@@ -1828,6 +1873,7 @@ def generate(
         input_features: torch.FloatTensor = None,
         attention_mask: Optional[torch.LongTensor] = None,
         generation_config: Optional[GenerationConfig] = None,
+        pad_to_max_mel_tokens: Optional[int] = None,
         output_hidden_states: Optional[bool] = None,
         **kwargs,
     ):
@@ -1855,6 +1901,11 @@ def generate(
                 priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                 configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                 default values, whose documentation should be checked to parameterize generation.
+            pad_to_max_mel_tokens (`int`, *optional*):
+                Pads generated speech_ids to the specified value. This is to implement the same logic from the official
+                repo, link: https://github.com/neonbjb/tortoise-tts/blob/80f89987a5abda5e2b082618cd74f9c7411141dc/tortoise/api.py#L430
+                and to make sure the logits are same.
+                This does not affect generation quality so please don't consider using it since it is less efficient.
             output_hidden_states (`bool`, *optional*):
                 Whether or not to return the hidden states of decoder model, text encoder and speech encoder models.
 
@@ -1862,6 +1913,17 @@ def generate(
             `ClvpOutput` or tuple: A `ClvpOutput` (if `return_dict_in_generate=True` or when
             `config.return_dict_in_generate=True`) or a tuple.
         """
+
+        # If the input sequences are larger than (self.config.decoder_config.max_text_tokens - 3) then raise error,
+        # because we need to add 3 tokens ( 1 bos tokens and 2 eos tokens) to the input_ids in ClvpConditioningEncoder to
+        # properly sample
+        sequence_length = input_ids.shape[-1]
+        if sequence_length > (self.config.decoder_config.max_text_tokens - 3):
+            raise ValueError(
+                f"Maximum sequence length reached! Found input_ids of length {sequence_length}."
+                f"Please make sure that the maximum length of input_ids is {self.config.decoder_config.max_text_tokens - 3}"
+            )
+
         if generation_config is None:
             generation_config = self.generation_config
 
@@ -1870,6 +1932,16 @@ def generate(
         generation_config.validate()
         self._validate_model_kwargs(model_kwargs.copy())
 
+        # pad input_ids as specified in the original repo
+        # link: https://github.com/neonbjb/tortoise-tts/blob/80f89987a5abda5e2b082618cd74f9c7411141dc/tortoise/api.py#L380
+        input_ids, attention_mask = _pad_extra_bos_eos_tokens(
+            input_ids,
+            attention_mask,
+            add_bos_token=False,
+            bos_token_id=self.config.text_config.bos_token_id,
+            eos_token_id=self.config.text_config.eos_token_id,
+        )
+
         conditioning_embeds = self.conditioning_encoder(
             input_features=input_features,
             input_ids=input_ids,
@@ -1884,6 +1956,15 @@ def generate(
         )
         if isinstance(decoder_outputs, ModelOutput):
             speech_ids = decoder_outputs.sequences
+
+        # pad to pad_to_max_mel_tokens if given, to replicate the original repo logic
+        # link: https://github.com/neonbjb/tortoise-tts/blob/80f89987a5abda5e2b082618cd74f9c7411141dc/tortoise/api.py#L430
+        if pad_to_max_mel_tokens is not None:
+            padding_needed = pad_to_max_mel_tokens - speech_ids.shape[-1]
+            speech_ids = torch.nn.functional.pad(
+                speech_ids, (0, padding_needed), value=self.generation_config.eos_token_id
+            )
+
         speech_ids = self.fix_speech_decoder_output(speech_ids)
 
         speech_outputs = self.speech_encoder_model(
diff --git a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
index c46527971f1ab5..23e493e08bf105 100644
--- a/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/image_processing_conditional_detr.py
@@ -37,6 +37,9 @@
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -45,12 +48,10 @@
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -80,15 +81,8 @@
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
 
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
@@ -332,10 +326,13 @@ def prepare_coco_detection_annotation(
 
     if annotations and "keypoints" in annotations[0]:
         keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
         keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
         num_keypoints = keypoints.shape[0]
         keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
-        new_target["keypoints"] = keypoints[keep]
+        new_target["keypoints"] = keypoints
 
     if return_segmentation_masks:
         segmentation_masks = [obj["segmentation"] for obj in annotations]
@@ -799,7 +796,7 @@ class ConditionalDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -858,7 +855,7 @@ def prepare_annotation(
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -868,12 +865,12 @@ def prepare_annotation(
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1115,7 +1112,7 @@ def preprocess(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1161,7 +1158,7 @@ def preprocess(
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1228,28 +1225,13 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
@@ -1348,8 +1330,8 @@ def preprocess(
     # POSTPROCESSING METHODS - TODO: add support for other frameworks
     def post_process(self, outputs, target_sizes):
         """
-        Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the COCO api. Only
-        supports PyTorch.
+        Converts the output of [`ConditionalDetrForObjectDetection`] into the format expected by the Pascal VOC format (xmin, ymin, xmax, ymax).
+        Only supports PyTorch.
 
         Args:
             outputs ([`ConditionalDetrObjectDetectionOutput`]):
diff --git a/src/transformers/models/conditional_detr/modeling_conditional_detr.py b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
index fd9a9d7e6f2025..d903abffafb455 100644
--- a/src/transformers/models/conditional_detr/modeling_conditional_detr.py
+++ b/src/transformers/models/conditional_detr/modeling_conditional_detr.py
@@ -1805,7 +1805,7 @@ def forward(
 
         >>> outputs = model(**inputs)
 
-        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> target_sizes = torch.tensor([image.size[::-1]])
         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
         ...     0
diff --git a/src/transformers/models/convnext/modeling_convnext.py b/src/transformers/models/convnext/modeling_convnext.py
index a0102b47ce8a21..a952e5d8165e15 100755
--- a/src/transformers/models/convnext/modeling_convnext.py
+++ b/src/transformers/models/convnext/modeling_convnext.py
@@ -529,14 +529,13 @@ def forward(
         outputs = self.encoder(
             embedding_output,
             output_hidden_states=True,
-            return_dict=True,
+            return_dict=return_dict,
         )
 
-        hidden_states = outputs.hidden_states
+        hidden_states = outputs.hidden_states if return_dict else outputs[1]
 
         feature_maps = ()
-        # we skip the stem
-        for idx, (stage, hidden_state) in enumerate(zip(self.stage_names[1:], hidden_states[1:])):
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
             if stage in self.out_features:
                 hidden_state = self.hidden_states_norms[stage](hidden_state)
                 feature_maps += (hidden_state,)
@@ -544,11 +543,11 @@ def forward(
         if not return_dict:
             output = (feature_maps,)
             if output_hidden_states:
-                output += (outputs.hidden_states,)
+                output += (hidden_states,)
             return output
 
         return BackboneOutput(
             feature_maps=feature_maps,
-            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            hidden_states=hidden_states if output_hidden_states else None,
             attentions=None,
         )
diff --git a/src/transformers/models/convnextv2/modeling_convnextv2.py b/src/transformers/models/convnextv2/modeling_convnextv2.py
index 07580731ea1a06..8d166200d12253 100644
--- a/src/transformers/models/convnextv2/modeling_convnextv2.py
+++ b/src/transformers/models/convnextv2/modeling_convnextv2.py
@@ -552,14 +552,13 @@ def forward(
         outputs = self.encoder(
             embedding_output,
             output_hidden_states=True,
-            return_dict=True,
+            return_dict=return_dict,
         )
 
-        hidden_states = outputs.hidden_states
+        hidden_states = outputs.hidden_states if return_dict else outputs[1]
 
         feature_maps = ()
-        # we skip the stem
-        for idx, (stage, hidden_state) in enumerate(zip(self.stage_names[1:], hidden_states[1:])):
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
             if stage in self.out_features:
                 hidden_state = self.hidden_states_norms[stage](hidden_state)
                 feature_maps += (hidden_state,)
@@ -567,11 +566,11 @@ def forward(
         if not return_dict:
             output = (feature_maps,)
             if output_hidden_states:
-                output += (outputs.hidden_states,)
+                output += (hidden_states,)
             return output
 
         return BackboneOutput(
             feature_maps=feature_maps,
-            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            hidden_states=hidden_states if output_hidden_states else None,
             attentions=None,
         )
diff --git a/src/transformers/models/data2vec/modeling_data2vec_vision.py b/src/transformers/models/data2vec/modeling_data2vec_vision.py
index b749b7bf154840..77c9363fa217c4 100644
--- a/src/transformers/models/data2vec/modeling_data2vec_vision.py
+++ b/src/transformers/models/data2vec/modeling_data2vec_vision.py
@@ -150,22 +150,26 @@ def __init__(self, config: Data2VecVisionConfig) -> None:
         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 
     def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.BoolTensor] = None) -> torch.Tensor:
-        embeddings = self.patch_embeddings(pixel_values)
+        embeddings, (patch_height, patch_width) = self.patch_embeddings(
+            pixel_values, self.position_embeddings[:, 1:, :] if self.position_embeddings is not None else None
+        )
         batch_size, seq_len, _ = embeddings.size()
 
-        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
         if bool_masked_pos is not None:
             mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
             # replace the masked visual tokens by mask_tokens
             w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
             embeddings = embeddings * (1 - w) + mask_tokens * w
 
-        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
         if self.position_embeddings is not None:
-            embeddings = embeddings + self.position_embeddings
+            cls_tokens = cls_tokens + self.position_embeddings[:, :1, :]
+
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
         embeddings = self.dropout(embeddings)
 
-        return embeddings
+        return embeddings, (patch_height, patch_width)
 
 
 # Copied from transformers.models.beit.modeling_beit.BeitPatchEmbeddings with Beit->Data2VecVision
@@ -193,19 +197,29 @@ def __init__(self, config):
 
         self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
 
-    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+    def forward(self, pixel_values: torch.Tensor, position_embedding: Optional[torch.Tensor] = None) -> torch.Tensor:
         batch_size, num_channels, height, width = pixel_values.shape
         if num_channels != self.num_channels:
             raise ValueError(
                 "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
             )
-        if height != self.image_size[0] or width != self.image_size[1]:
-            raise ValueError(
-                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+
+        embeddings = self.projection(pixel_values)
+        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
+
+        if position_embedding is not None:
+            # interpolate the position embedding to the corresponding size
+            position_embedding = position_embedding.view(1, self.patch_shape[0], self.patch_shape[1], -1).permute(
+                0, 3, 1, 2
+            )
+            position_embedding = nn.functional.interpolate(
+                position_embedding, size=(patch_height, patch_width), mode="bicubic"
             )
-        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+            embeddings = embeddings + position_embedding
 
-        return embeddings
+        embeddings = embeddings.flatten(2).transpose(1, 2)
+
+        return embeddings, (patch_height, patch_width)
 
 
 # Copied from transformers.models.beit.modeling_beit.BeitSelfAttention with Beit->Data2VecVision
@@ -683,7 +697,7 @@ def forward(
         # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
-        embedding_output = self.embeddings(pixel_values, bool_masked_pos)
+        embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values, bool_masked_pos)
 
         encoder_outputs = self.encoder(
             embedding_output,
@@ -1079,6 +1093,12 @@ def __init__(self, config: Data2VecVisionConfig) -> None:
         self.data2vec_vision = Data2VecVisionModel(config, add_pooling_layer=False)
 
         # FPNs
+        if len(self.config.out_indices) != 4:
+            raise ValueError(
+                "Data2VecVisionForSemanticSegmentation requires config.out_indices to be a list of 4 integers, "
+                "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of "
+                "a base-sized architecture."
+            )
         self.fpn1 = nn.Sequential(
             nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2),
             nn.BatchNorm2d(config.hidden_size),
diff --git a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
index 4e83488d75b435..00cf8eaecfa308 100644
--- a/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/image_processing_deformable_detr.py
@@ -37,6 +37,9 @@
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -45,12 +48,10 @@
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -79,15 +80,7 @@
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
-
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
@@ -331,10 +324,13 @@ def prepare_coco_detection_annotation(
 
     if annotations and "keypoints" in annotations[0]:
         keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
         keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
         num_keypoints = keypoints.shape[0]
         keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
-        new_target["keypoints"] = keypoints[keep]
+        new_target["keypoints"] = keypoints
 
     if return_segmentation_masks:
         segmentation_masks = [obj["segmentation"] for obj in annotations]
@@ -798,7 +794,7 @@ class DeformableDetrImageProcessor(BaseImageProcessor):
     # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -857,7 +853,7 @@ def prepare_annotation(
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -867,12 +863,12 @@ def prepare_annotation(
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1114,7 +1110,7 @@ def preprocess(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1160,7 +1156,7 @@ def preprocess(
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1227,28 +1223,13 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py
index 2a93fc2aa0e8e6..3767eef0392f6a 100755
--- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py
+++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py
@@ -1900,7 +1900,7 @@ def forward(
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
 
-        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> target_sizes = torch.tensor([image.size[::-1]])
         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
         ...     0
diff --git a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py
index e4607d56f7061b..2e9055a935aa52 100644
--- a/src/transformers/models/deprecated/open_llama/modeling_open_llama.py
+++ b/src/transformers/models/deprecated/open_llama/modeling_open_llama.py
@@ -83,7 +83,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -113,7 +113,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
         t = t / self.scaling_factor
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -140,7 +140,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
 
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -860,7 +860,6 @@ def _reorder_cache(past_key_values, beam_idx):
     """,
     OPEN_LLAMA_START_DOCSTRING,
 )
-# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with LLAMA->OPEN_LLAMA,Llama->OpenLlama
 class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
diff --git a/src/transformers/models/transfo_xl/__init__.py b/src/transformers/models/deprecated/transfo_xl/__init__.py
similarity index 96%
rename from src/transformers/models/transfo_xl/__init__.py
rename to src/transformers/models/deprecated/transfo_xl/__init__.py
index ce4215b0217bae..f3674e19665ca7 100644
--- a/src/transformers/models/transfo_xl/__init__.py
+++ b/src/transformers/models/deprecated/transfo_xl/__init__.py
@@ -14,7 +14,7 @@
 
 from typing import TYPE_CHECKING
 
-from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
+from ....utils import OptionalDependencyNotAvailable, _LazyModule, is_tf_available, is_torch_available
 
 
 _import_structure = {
diff --git a/src/transformers/models/transfo_xl/configuration_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
similarity index 97%
rename from src/transformers/models/transfo_xl/configuration_transfo_xl.py
rename to src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
index 8550e71802867a..842c1643a00b26 100644
--- a/src/transformers/models/transfo_xl/configuration_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
@@ -15,8 +15,8 @@
 # limitations under the License.
 """ Transformer XL configuration"""
 
-from ...configuration_utils import PretrainedConfig
-from ...utils import logging
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
 
 
 logger = logging.get_logger(__name__)
@@ -74,7 +74,7 @@ class TransfoXLConfig(PretrainedConfig):
             Whether or not to use adaptive softmax.
         dropout (`float`, *optional*, defaults to 0.1):
             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
-        dropatt (`float`, *optional*, defaults to 0):
+        dropatt (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         untie_r (`boolean`, *optional*, defaults to `True`):
             Whether ot not to untie relative position biases.
@@ -86,8 +86,10 @@ class TransfoXLConfig(PretrainedConfig):
             Parameters initialized by N(0, init_std)
         init_std (`float`, *optional*, defaults to 0.02):
             Parameters initialized by N(0, init_std)
-        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
             The epsilon to use in the layer normalization layers
+        eos_token_id (`int`, *optional*, defaults to 0):
+            End of stream token id.
 
     Examples:
 
diff --git a/src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
old mode 100755
new mode 100644
similarity index 95%
rename from src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
rename to src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
index 646c8a2342fc3a..d2693ac333b84b
--- a/src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+++ b/src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
@@ -23,8 +23,8 @@
 import torch
 
 from transformers import TransfoXLConfig, TransfoXLLMHeadModel, load_tf_weights_in_transfo_xl
-from transformers.models.transfo_xl import tokenization_transfo_xl as data_utils
-from transformers.models.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
+from transformers.models.deprecated.transfo_xl import tokenization_transfo_xl as data_utils
+from transformers.models.deprecated.transfo_xl.tokenization_transfo_xl import CORPUS_NAME, VOCAB_FILES_NAMES
 from transformers.utils import CONFIG_NAME, WEIGHTS_NAME, logging
 
 
diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
similarity index 97%
rename from src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
rename to src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
index 88005b7e0600d9..9ae32f8cebaaea 100644
--- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
@@ -25,7 +25,7 @@
 import numpy as np
 import tensorflow as tf
 
-from ...modeling_tf_utils import (
+from ....modeling_tf_utils import (
     TFModelInputType,
     TFPreTrainedModel,
     TFSequenceClassificationLoss,
@@ -33,8 +33,8 @@
     keras_serializable,
     unpack_inputs,
 )
-from ...tf_utils import shape_list, stable_softmax
-from ...utils import (
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
@@ -987,6 +987,21 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model
 
         return inputs
 
+    # Adapted from the torch tie_weights function
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if self.config.tie_word_embeddings and "crit.out_layers" in tf_weight:
+            return tf_weight, tf_weight.replace("crit.out_layers", "transformer.word_emb.emb_layers")
+        elif self.config.tie_projs and "crit.out_projs" in tf_weight:
+            for i, tie_proj in enumerate(self.config.tie_projs):
+                if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
+                    # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                    return tf_weight, tf_weight.replace(f"crit.out_projs.{i}", "transformer.word_emb.emb_projs.0")
+                elif tie_proj and self.config.div_val != 1:
+                    # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+                    return tf_weight, tf_weight.replace("crit.out_projs", "transformer.word_emb.emb_projs")
+        else:
+            return (tf_weight,)
+
 
 @add_start_docstrings(
     """
diff --git a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
similarity index 99%
rename from src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py
rename to src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
index dcfa84d0f94b69..c6a380842e480d 100644
--- a/src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py
+++ b/src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
@@ -20,7 +20,7 @@
 
 import tensorflow as tf
 
-from ...tf_utils import shape_list
+from ....tf_utils import shape_list
 
 
 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
similarity index 99%
rename from src/transformers/models/transfo_xl/modeling_transfo_xl.py
rename to src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
index 1e944c335ae7d0..2d343bd71571e8 100644
--- a/src/transformers/models/transfo_xl/modeling_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
@@ -25,8 +25,8 @@
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
-from ...modeling_utils import PreTrainedModel
-from ...utils import (
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
     ModelOutput,
     add_code_sample_docstrings,
     add_start_docstrings,
diff --git a/src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py b/src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
similarity index 100%
rename from src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
rename to src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
diff --git a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
similarity index 94%
rename from src/transformers/models/transfo_xl/tokenization_transfo_xl.py
rename to src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
index 8a2aba92f7a828..cea74e76bc15a6 100644
--- a/src/transformers/models/transfo_xl/tokenization_transfo_xl.py
+++ b/src/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
@@ -27,13 +27,14 @@
 
 import numpy as np
 
-from ...tokenization_utils import PreTrainedTokenizer
-from ...utils import (
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import (
     cached_file,
     is_sacremoses_available,
     is_torch_available,
     logging,
     requires_backends,
+    strtobool,
     torch_only_method,
 )
 
@@ -181,6 +182,12 @@ def __init__(
         language="en",
         **kwargs,
     ):
+        logger.error(
+            "`TransfoXL` was deprecated due to security issues linked to `pickle.load` in `TransfoXLTokenizer`. "
+            "See more details on this model's documentation page: "
+            "`https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/transfo-xl.md`."
+        )
+
         requires_backends(self, "sacremoses")
         if special is None:
             special = []
@@ -206,6 +213,14 @@ def __init__(
             vocab_dict = None
             if pretrained_vocab_file is not None:
                 # Priority on pickle files (support PyTorch and TF)
+                if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+                    raise ValueError(
+                        "This part uses `pickle.load` which is insecure and will execute arbitrary code that is "
+                        "potentially malicious. It's recommended to never unpickle data that could have come from an "
+                        "untrusted source, or that could have been tampered with. If you already verified the pickle "
+                        "data and decided to use it, you can set the environment variable "
+                        "`TRUST_REMOTE_CODE` to `True` to allow it."
+                    )
                 with open(pretrained_vocab_file, "rb") as f:
                     vocab_dict = pickle.load(f)
 
@@ -223,7 +238,7 @@ def __init__(
 
             if vocab_dict is not None:
                 for key, value in vocab_dict.items():
-                    if key not in self.__dict__ or key == "sym2idx":
+                    if key not in self.__dict__ or key in ["sym2idx", "idx2sym"]:
                         self.__dict__[key] = value
             elif vocab_file is not None:
                 self.build_vocab()
@@ -784,6 +799,13 @@ def get_lm_corpus(datadir, dataset):
         corpus = torch.load(fn_pickle)
     elif os.path.exists(fn):
         logger.info("Loading cached dataset from pickle...")
+        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+            raise ValueError(
+                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
+                "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
+                "that could have been tampered with. If you already verified the pickle data and decided to use it, "
+                "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
+            )
         with open(fn, "rb") as fp:
             corpus = pickle.load(fp)
     else:
diff --git a/src/transformers/models/deta/image_processing_deta.py b/src/transformers/models/deta/image_processing_deta.py
index 36c2c2e3812788..1e3ece8e324ad0 100644
--- a/src/transformers/models/deta/image_processing_deta.py
+++ b/src/transformers/models/deta/image_processing_deta.py
@@ -34,6 +34,8 @@
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -42,9 +44,8 @@
     is_batched,
     is_scaled_image,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
     is_flax_available,
@@ -57,7 +58,7 @@
     is_vision_available,
     logging,
 )
-from ...utils.generic import ExplicitEnum, TensorType
+from ...utils.generic import TensorType
 
 
 if is_torch_available():
@@ -73,13 +74,7 @@
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
@@ -323,10 +318,13 @@ def prepare_coco_detection_annotation(
 
     if annotations and "keypoints" in annotations[0]:
         keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
         keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
         num_keypoints = keypoints.shape[0]
         keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
-        new_target["keypoints"] = keypoints[keep]
+        new_target["keypoints"] = keypoints
 
     if return_segmentation_masks:
         segmentation_masks = [obj["segmentation"] for obj in annotations]
@@ -504,7 +502,7 @@ class DetaImageProcessor(BaseImageProcessor):
 
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -539,7 +537,7 @@ def prepare_annotation(
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -549,12 +547,12 @@ def prepare_annotation(
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -786,7 +784,7 @@ def preprocess(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -800,12 +798,12 @@ def preprocess(
                 Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
                 from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
             annotations (`List[Dict]` or `List[List[Dict]]`, *optional*):
-                List of annotations associated with the image or batch of images. If annotionation is for object
+                List of annotations associated with the image or batch of images. If annotation is for object
                 detection, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
                   dictionary. An image can have no annotations, in which case the list should be empty.
-                If annotionation is for segmentation, the annotations should be a dictionary with the following keys:
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
                   An image can have no segments, in which case the list should be empty.
@@ -832,7 +830,7 @@ def preprocess(
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -891,28 +889,13 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py
index 0f245f2a3058f4..8362b49eee30f2 100644
--- a/src/transformers/models/deta/modeling_deta.py
+++ b/src/transformers/models/deta/modeling_deta.py
@@ -1414,14 +1414,12 @@ def get_encoder(self):
     def get_decoder(self):
         return self.decoder
 
-    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.freeze_backbone
     def freeze_backbone(self):
-        for name, param in self.backbone.conv_encoder.model.named_parameters():
+        for name, param in self.backbone.model.named_parameters():
             param.requires_grad_(False)
 
-    # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.unfreeze_backbone
     def unfreeze_backbone(self):
-        for name, param in self.backbone.conv_encoder.model.named_parameters():
+        for name, param in self.backbone.model.named_parameters():
             param.requires_grad_(True)
 
     # Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrModel.get_valid_ratio
@@ -1853,7 +1851,7 @@ def forward(
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
 
-        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> target_sizes = torch.tensor([image.size[::-1]])
         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
         ...     0
diff --git a/src/transformers/models/detr/image_processing_detr.py b/src/transformers/models/detr/image_processing_detr.py
index 995c33fcdb9a29..8b64b9c4d9a46b 100644
--- a/src/transformers/models/detr/image_processing_detr.py
+++ b/src/transformers/models/detr/image_processing_detr.py
@@ -36,6 +36,9 @@
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -44,12 +47,10 @@
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -79,15 +80,7 @@
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
-
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
@@ -326,10 +319,13 @@ def prepare_coco_detection_annotation(
 
     if annotations and "keypoints" in annotations[0]:
         keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
         keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
         num_keypoints = keypoints.shape[0]
         keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
-        new_target["keypoints"] = keypoints[keep]
+        new_target["keypoints"] = keypoints
 
     if return_segmentation_masks:
         segmentation_masks = [obj["segmentation"] for obj in annotations]
@@ -782,7 +778,7 @@ class DetrImageProcessor(BaseImageProcessor):
 
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -839,7 +835,7 @@ def prepare_annotation(
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -849,12 +845,12 @@ def prepare_annotation(
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1086,7 +1082,7 @@ def preprocess(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1132,7 +1128,7 @@ def preprocess(
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1199,28 +1195,13 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/detr/modeling_detr.py b/src/transformers/models/detr/modeling_detr.py
index 9b73d0e6516b6c..e0680e1874964b 100644
--- a/src/transformers/models/detr/modeling_detr.py
+++ b/src/transformers/models/detr/modeling_detr.py
@@ -1535,7 +1535,7 @@ def forward(
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
 
-        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> target_sizes = torch.tensor([image.size[::-1]])
         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
         ...     0
diff --git a/src/transformers/models/dinov2/modeling_dinov2.py b/src/transformers/models/dinov2/modeling_dinov2.py
index e6a17e570743d8..66bac639f6731b 100644
--- a/src/transformers/models/dinov2/modeling_dinov2.py
+++ b/src/transformers/models/dinov2/modeling_dinov2.py
@@ -105,7 +105,7 @@ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width:
         patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
         patch_pos_embed = nn.functional.interpolate(
             patch_pos_embed,
-            scale_factor=(height / math.sqrt(num_positions), width / math.sqrt(num_positions)),
+            scale_factor=(float(height / math.sqrt(num_positions)), float(width / math.sqrt(num_positions))),
             mode="bicubic",
             align_corners=False,
         )
diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py
index 144fde42e0bf00..6e38ee84e98f6c 100755
--- a/src/transformers/models/distilbert/modeling_distilbert.py
+++ b/src/transformers/models/distilbert/modeling_distilbert.py
@@ -46,6 +46,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
@@ -269,6 +270,15 @@ class DistilBertFlashAttention2(MultiHeadSelfAttention):
     API of flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def forward(
         self,
         query: torch.Tensor,
@@ -363,6 +373,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -383,13 +399,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -455,6 +471,12 @@ def ff_chunk(self, input: torch.Tensor) -> torch.Tensor:
         return x
 
 
+DISTILBERT_ATTENTION_CLASSES = {
+    "eager": MultiHeadSelfAttention,
+    "flash_attention_2": DistilBertFlashAttention2,
+}
+
+
 class TransformerBlock(nn.Module):
     def __init__(self, config: PretrainedConfig):
         super().__init__()
@@ -463,11 +485,7 @@ def __init__(self, config: PretrainedConfig):
         if config.dim % config.n_heads != 0:
             raise ValueError(f"config.n_heads {config.n_heads} must divide config.dim {config.dim} evenly")
 
-        self.attention = (
-            MultiHeadSelfAttention(config)
-            if not getattr(config, "_flash_attn_2_enabled", False)
-            else DistilBertFlashAttention2(config)
-        )
+        self.attention = DISTILBERT_ATTENTION_CLASSES[config._attn_implementation](config)
         self.sa_layer_norm = nn.LayerNorm(normalized_shape=config.dim, eps=1e-12)
 
         self.ffn = FFN(config)
@@ -687,6 +705,7 @@ def __init__(self, config: PretrainedConfig):
 
         self.embeddings = Embeddings(config)  # Embeddings
         self.transformer = Transformer(config)  # Encoder
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -792,7 +811,7 @@ def forward(
 
         embeddings = self.embeddings(input_ids, inputs_embeds)  # (bs, seq_length, dim)
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         else:
             if attention_mask is None:
diff --git a/src/transformers/models/dpt/convert_dpt_beit_to_hf.py b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py
new file mode 100644
index 00000000000000..1e7d438e025872
--- /dev/null
+++ b/src/transformers/models/dpt/convert_dpt_beit_to_hf.py
@@ -0,0 +1,305 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert DPT 3.1 checkpoints from the MiDaS repository. URL: https://github.com/isl-org/MiDaS"""
+
+
+import argparse
+from pathlib import Path
+
+import requests
+import torch
+from PIL import Image
+
+from transformers import BeitConfig, DPTConfig, DPTForDepthEstimation, DPTImageProcessor
+from transformers.utils import logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+
+def get_dpt_config(model_name):
+    hidden_size = 768
+    num_hidden_layers = 12
+    num_attention_heads = 12
+    intermediate_size = 3072
+    out_features = ["stage3", "stage6", "stage9", "stage12"]  # beit-base-384 uses [2, 5, 8, 11]
+
+    if "large" in model_name:
+        hidden_size = 1024
+        num_hidden_layers = 24
+        num_attention_heads = 16
+        intermediate_size = 4096
+        out_features = ["stage6", "stage12", "stage18", "stage24"]  # beit-large-512 uses [5, 11, 17, 23]
+
+    if "512" in model_name:
+        image_size = 512
+    elif "384" in model_name:
+        image_size = 384
+    else:
+        raise ValueError("Model not supported")
+
+    backbone_config = BeitConfig(
+        image_size=image_size,
+        num_hidden_layers=num_hidden_layers,
+        hidden_size=hidden_size,
+        intermediate_size=intermediate_size,
+        num_attention_heads=num_attention_heads,
+        use_relative_position_bias=True,
+        reshape_hidden_states=False,
+        out_features=out_features,
+    )
+
+    neck_hidden_sizes = [256, 512, 1024, 1024] if "large" in model_name else [96, 192, 384, 768]
+    config = DPTConfig(backbone_config=backbone_config, neck_hidden_sizes=neck_hidden_sizes)
+
+    return config, image_size
+
+
+# here we list all keys to be renamed (original name on the left, our name on the right)
+def create_rename_keys(config):
+    rename_keys = []
+
+    # fmt: off
+    # stem
+    rename_keys.append(("pretrained.model.cls_token", "backbone.embeddings.cls_token"))
+    rename_keys.append(("pretrained.model.patch_embed.proj.weight", "backbone.embeddings.patch_embeddings.projection.weight"))
+    rename_keys.append(("pretrained.model.patch_embed.proj.bias", "backbone.embeddings.patch_embeddings.projection.bias"))
+
+    # Transfomer encoder
+    for i in range(config.backbone_config.num_hidden_layers):
+        rename_keys.append((f"pretrained.model.blocks.{i}.gamma_1", f"backbone.encoder.layer.{i}.lambda_1"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.gamma_2", f"backbone.encoder.layer.{i}.lambda_2"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.norm1.weight", f"backbone.encoder.layer.{i}.layernorm_before.weight"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.norm1.bias", f"backbone.encoder.layer.{i}.layernorm_before.bias"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.norm2.weight", f"backbone.encoder.layer.{i}.layernorm_after.weight"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.norm2.bias", f"backbone.encoder.layer.{i}.layernorm_after.bias"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.weight", f"backbone.encoder.layer.{i}.intermediate.dense.weight"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc1.bias", f"backbone.encoder.layer.{i}.intermediate.dense.bias"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.weight", f"backbone.encoder.layer.{i}.output.dense.weight"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.mlp.fc2.bias", f"backbone.encoder.layer.{i}.output.dense.bias"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.weight", f"backbone.encoder.layer.{i}.attention.output.dense.weight"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.attn.proj.bias", f"backbone.encoder.layer.{i}.attention.output.dense.bias"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_bias_table", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table"))
+        rename_keys.append((f"pretrained.model.blocks.{i}.attn.relative_position_index", f"backbone.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index"))
+
+    # activation postprocessing (readout projections + resize blocks)
+    for i in range(4):
+        rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.weight", f"neck.reassemble_stage.readout_projects.{i}.0.weight"))
+        rename_keys.append((f"pretrained.act_postprocess{i+1}.0.project.0.bias", f"neck.reassemble_stage.readout_projects.{i}.0.bias"))
+
+        rename_keys.append((f"pretrained.act_postprocess{i+1}.3.weight", f"neck.reassemble_stage.layers.{i}.projection.weight"))
+        rename_keys.append((f"pretrained.act_postprocess{i+1}.3.bias", f"neck.reassemble_stage.layers.{i}.projection.bias"))
+
+        if i != 2:
+            rename_keys.append((f"pretrained.act_postprocess{i+1}.4.weight", f"neck.reassemble_stage.layers.{i}.resize.weight"))
+            rename_keys.append((f"pretrained.act_postprocess{i+1}.4.bias", f"neck.reassemble_stage.layers.{i}.resize.bias"))
+
+    # refinenet (tricky here)
+    mapping = {1:3, 2:2, 3:1, 4:0}
+
+    for i in range(1, 5):
+        j = mapping[i]
+        rename_keys.append((f"scratch.refinenet{i}.out_conv.weight", f"neck.fusion_stage.layers.{j}.projection.weight"))
+        rename_keys.append((f"scratch.refinenet{i}.out_conv.bias", f"neck.fusion_stage.layers.{j}.projection.bias"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.weight"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution1.bias"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.weight"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit1.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer1.convolution2.bias"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.weight"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv1.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution1.bias"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.weight", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.weight"))
+        rename_keys.append((f"scratch.refinenet{i}.resConfUnit2.conv2.bias", f"neck.fusion_stage.layers.{j}.residual_layer2.convolution2.bias"))
+
+    # scratch convolutions
+    for i in range(4):
+        rename_keys.append((f"scratch.layer{i+1}_rn.weight", f"neck.convs.{i}.weight"))
+
+    # head
+    for i in range(0, 5, 2):
+        rename_keys.append((f"scratch.output_conv.{i}.weight", f"head.head.{i}.weight"))
+        rename_keys.append((f"scratch.output_conv.{i}.bias", f"head.head.{i}.bias"))
+
+    return rename_keys
+
+
+def remove_ignore_keys_(state_dict):
+    ignore_keys = ["pretrained.model.head.weight", "pretrained.model.head.bias"]
+    for k in ignore_keys:
+        state_dict.pop(k, None)
+
+
+# we split up the matrix of each encoder layer into queries, keys and values
+def read_in_q_k_v(state_dict, config):
+    hidden_size = config.backbone_config.hidden_size
+    for i in range(config.backbone_config.num_hidden_layers):
+        # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias)
+        in_proj_weight = state_dict.pop(f"pretrained.model.blocks.{i}.attn.qkv.weight")
+        q_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.q_bias")
+        v_bias = state_dict.pop(f"pretrained.model.blocks.{i}.attn.v_bias")
+        # next, add query, keys and values (in that order) to the state dict
+        state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[:hidden_size, :]
+        state_dict[f"backbone.encoder.layer.{i}.attention.attention.query.bias"] = q_bias
+        state_dict[f"backbone.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
+            hidden_size : hidden_size * 2, :
+        ]
+        state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[-hidden_size:, :]
+        state_dict[f"backbone.encoder.layer.{i}.attention.attention.value.bias"] = v_bias
+
+
+def rename_key(dct, old, new):
+    val = dct.pop(old)
+    dct[new] = val
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    im = Image.open(requests.get(url, stream=True).raw)
+    return im
+
+
+@torch.no_grad()
+def convert_dpt_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub):
+    """
+    Copy/paste/tweak model's weights to our DPT structure.
+    """
+
+    name_to_url = {
+        "dpt-beit-large-512": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_512.pt",
+        "dpt-beit-large-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_large_384.pt",
+        "dpt-beit-base-384": "https://github.com/isl-org/MiDaS/releases/download/v3_1/dpt_beit_base_384.pt",
+    }
+
+    # define DPT configuration based on URL
+    checkpoint_url = name_to_url[model_name]
+    config, image_size = get_dpt_config(model_name)
+    # load original state_dict from URL
+    state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")
+    # remove certain keys
+    remove_ignore_keys_(state_dict)
+    # rename keys
+    rename_keys = create_rename_keys(config)
+    for src, dest in rename_keys:
+        rename_key(state_dict, src, dest)
+    # read in qkv matrices
+    read_in_q_k_v(state_dict, config)
+
+    # load HuggingFace model
+    model = DPTForDepthEstimation(config)
+    missing_keys, unexpected_keys = model.load_state_dict(state_dict, strict=False)
+    print("Missing keys:", missing_keys)
+    print("Unexpected keys:", unexpected_keys)
+    assert missing_keys == []
+    # assert unexpected_keys == ["pretrained.model.fc_norm.weight", "pretrained.model.fc_norm.bias"]
+    model.eval()
+
+    # Check outputs on an image
+    processor = DPTImageProcessor(
+        size={"height": image_size, "width": image_size}, keep_aspect_ratio=True, ensure_multiple_of=32
+    )
+
+    image = prepare_img()
+    pixel_values = processor(image, return_tensors="pt").pixel_values
+
+    print("First values of pixel values:", pixel_values[0, 0, :3, :3])
+    print("Mean of pixel values:", pixel_values.mean().item())
+    print("Shape of pixel values:", pixel_values.shape)
+
+    import requests
+    from PIL import Image
+    from torchvision import transforms
+
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    image = Image.open(requests.get(url, stream=True).raw)
+
+    transforms = transforms.Compose(
+        [
+            transforms.Resize((image_size, image_size)),
+            transforms.ToTensor(),
+        ]
+    )
+    pixel_values = transforms(image).unsqueeze(0)
+
+    # forward pass
+    with torch.no_grad():
+        outputs = model(pixel_values)
+
+    predicted_depth = outputs.predicted_depth
+
+    print("Shape of predicted depth:", predicted_depth.shape)
+    print("First values of predicted depth:", predicted_depth[0, :3, :3])
+
+    # assert logits
+    # TODO there's still a small difference with the original logits
+    if model_name == "dpt-beit-large-512":
+        # OK, checked
+        expected_shape = torch.Size([1, 512, 512])
+        expected_slice = torch.tensor(
+            [[2804.6260, 2792.5708, 2812.9263], [2772.0288, 2780.1118, 2796.2529], [2748.1094, 2766.6558, 2766.9834]]
+        )
+    elif model_name == "dpt-beit-large-384":
+        # OK, checked
+        expected_shape = torch.Size([1, 384, 384])
+        expected_slice = torch.tensor(
+            [[1783.2273, 1780.5729, 1792.6453], [1759.9817, 1765.5359, 1778.5002], [1739.1633, 1754.7903, 1757.1990]],
+        )
+    elif model_name == "dpt-beit-base-384":
+        # OK, checked
+        expected_shape = torch.Size([1, 384, 384])
+        expected_slice = torch.tensor(
+            [[2898.4482, 2891.3750, 2904.8079], [2858.6685, 2877.2615, 2894.4507], [2842.1235, 2854.1023, 2861.6328]],
+        )
+
+    assert predicted_depth.shape == torch.Size(expected_shape)
+    assert torch.allclose(predicted_depth[0, :3, :3], expected_slice)
+    print("Looks ok!")
+
+    if pytorch_dump_folder_path is not None:
+        Path(pytorch_dump_folder_path).mkdir(exist_ok=True)
+        print(f"Saving model and processor to {pytorch_dump_folder_path}")
+        model.save_pretrained(pytorch_dump_folder_path)
+        processor.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        print("Pushing model and processor to hub...")
+        model.push_to_hub(repo_id=f"nielsr/{model_name}")
+        processor.push_to_hub(repo_id=f"nielsr/{model_name}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--model_name",
+        default="dpt-beit-large-512",
+        type=str,
+        choices=["dpt-beit-large-512", "dpt-beit-large-384", "dpt-beit-base-384"],
+        help="Name of the model you'd like to convert.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        default=None,
+        type=str,
+        help="Path to the output PyTorch model directory.",
+    )
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        help="Whether to push the model to the hub after conversion.",
+    )
+
+    args = parser.parse_args()
+    convert_dpt_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub)
diff --git a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
index 5b4fc5884c10e3..afd8963359fc9a 100644
--- a/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
+++ b/src/transformers/models/encoder_decoder/modeling_tf_encoder_decoder.py
@@ -291,16 +291,7 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         return self.decoder.set_output_embeddings(new_embeddings)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r"""
-        Example:
-
-        ```python
-        >>> from transformers import TFEncoderDecoderModel
-
-        >>> model = TFEncoderDecoderModel.from_pretrained("ydshieh/bert2bert-cnn_dailymail-fp16")
-        ```"""
+    def tf_to_pt_weight_rename(self, tf_weight):
         # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
         # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
         # However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption
@@ -311,18 +302,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file.
         # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it
         # or not.
-
-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-        encoder_model_type = config.encoder.model_type
-
-        def tf_to_pt_weight_rename(tf_weight):
-            if "encoder" in tf_weight and "decoder" not in tf_weight:
-                return re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight)
-            else:
-                return tf_weight
-
-        kwargs["tf_to_pt_weight_rename"] = tf_to_pt_weight_rename
-        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        encoder_model_type = self.config.encoder.model_type
+        if "encoder" in tf_weight and "decoder" not in tf_weight:
+            return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),)
+        else:
+            return (tf_weight,)
 
     @classmethod
     def from_encoder_decoder_pretrained(
diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py
index d4c647c846fa1a..78088bc5d79027 100644
--- a/src/transformers/models/falcon/modeling_falcon.py
+++ b/src/transformers/models/falcon/modeling_falcon.py
@@ -16,7 +16,7 @@
 
 import math
 import warnings
-from typing import Optional, Tuple, Union
+from typing import TYPE_CHECKING, Optional, Tuple, Union
 
 import torch
 import torch.utils.checkpoint
@@ -24,7 +24,11 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss
 from torch.nn import functional as F
 
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -33,16 +37,21 @@
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import is_torch_greater_or_equal_than_2_0
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
 )
 from .configuration_falcon import FalconConfig
 
 
+if TYPE_CHECKING:
+    from ...configuration_utils import PretrainedConfig
+
 if is_flash_attn_2_available():
     from flash_attn import flash_attn_func, flash_attn_varlen_func
     from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
@@ -141,7 +150,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -171,7 +180,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
         t = t / self.scaling_factor
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -198,7 +207,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
 
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -277,6 +286,7 @@ def __init__(self, config: FalconConfig):
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
         self.is_causal = True
+        self._use_sdpa = config._attn_implementation == "sdpa"
 
         if self.head_dim * self.num_heads != self.hidden_size:
             raise ValueError(
@@ -437,17 +447,23 @@ def forward(
         else:
             present = None
 
-        if alibi is None:
-            if hasattr(F, "scaled_dot_product_attention") and not output_attentions:
-                # TODO: deprecate this once we add FA2 support in Falcon
-                logger.warning_once(
-                    "The current implementation of Falcon calls `torch.scaled_dot_product_attention` directly, this will be deprecated in the"
-                    " future in favor of the `BetterTransformer` API. Please install the latest optimum library with `pip install -U optimum` and call "
-                    "`model.to_bettertransformer()` to benefit from `torch.scaled_dot_product_attention` and future performance optimizations."
-                )
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_layer.device.type == "cuda" and attention_mask is not None:
+            query_layer = query_layer.contiguous()
+            key_layer = key_layer.contiguous()
+            value_layer = value_layer.contiguous()
 
+        if alibi is None:
+            if self._use_sdpa and not output_attentions:
                 attn_output = F.scaled_dot_product_attention(
-                    query_layer, key_layer, value_layer, attention_mask, 0.0, is_causal=False
+                    query_layer,
+                    key_layer,
+                    value_layer,
+                    attention_mask,
+                    0.0,
+                    # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
+                    is_causal=self.is_causal and attention_mask is None and query_length > 1,
                 )
                 attention_scores = None
             else:
@@ -455,58 +471,70 @@ def forward(
                 attention_scores /= math.sqrt(self.head_dim)
 
                 attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype)
+                # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi).
                 attn_output = attention_scores @ value_layer
 
             attn_output = attn_output.view(batch_size, self.num_heads, query_length, self.head_dim)
             attn_output = attn_output.permute(0, 2, 1, 3)
             attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
 
-            output_tensor = self.dense(attn_output)
+            attn_output = self.dense(attn_output)
 
             if output_attentions:
-                return output_tensor, present, attention_scores
+                return attn_output, present, attention_scores
             else:
-                return output_tensor, present
+                return attn_output, present
 
         else:
-            matmul_result = query_layer @ key_layer.transpose(-1, -2)
+            if self._use_sdpa and not output_attentions and head_mask is None:
+                attn_output = F.scaled_dot_product_attention(
+                    query_layer,
+                    key_layer,
+                    value_layer,
+                    attn_mask=attention_mask,
+                    dropout_p=self.attention_dropout.p if self.training else 0.0,
+                    is_causal=self.is_causal and attention_mask is None and query_length > 1,
+                )
+                attn_output = attn_output.transpose(1, 2)
+                attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
+
+                attn_output = self.dense(attn_output)
+            else:
+                matmul_result = query_layer @ key_layer.transpose(-1, -2)
+
+                # change view to [batch_size, num_heads, q_length, kv_length]
+                attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length)
 
-            # change view to [batch_size, num_heads, q_length, kv_length]
-            attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length)
+                # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
+                input_dtype = attention_scores.dtype
+                # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
+                if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
+                    attention_scores = attention_scores.to(torch.float32)
 
-            # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
-            input_dtype = attention_scores.dtype
-            # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
-            if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
-                attention_scores = attention_scores.to(torch.float32)
-            # Matt (HF) note: We could possibly use F.scaled_dot_product_attention here too, by
-            # adding (alibi * self.inv_norm_factor) to attention_mask. I think this would be mathematically
-            # equivalent and more performant, but there might be a numerical difference. If you're reading this
-            # and you'd like to experiment and maybe file a PR, feel free!
-            attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)
-            attention_logits *= self.inv_norm_factor
-            attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype)
-            # [batch_size, num_heads, q_length, kv_length]
-            attention_probs = self.attention_dropout(attention_probs)
+                attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)
+                attention_logits *= self.inv_norm_factor
+                attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype)
+                # [batch_size, num_heads, q_length, kv_length]
+                attention_probs = self.attention_dropout(attention_probs)
 
-            if head_mask is not None:
-                attention_probs = attention_probs * head_mask
+                if head_mask is not None:
+                    attention_probs = attention_probs * head_mask
 
-            # change view [batch_size, num_heads, q_length, kv_length]
-            attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length)
+                # change view [batch_size, num_heads, q_length, kv_length]
+                attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length)
 
-            # matmul: [batch_size * num_heads, q_length, head_dim]
-            context_layer = (attention_probs_reshaped @ value_layer).flatten(0, 1)
+                # matmul: [batch_size * num_heads, q_length, head_dim]
+                attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1)
 
-            # change view [batch_size, q_length, num_heads * head_dim]
-            context_layer = self._merge_heads(context_layer)
+                # change view [batch_size, q_length, num_heads * head_dim]
+                attn_output = self._merge_heads(attn_output)
 
-            output_tensor = self.dense(context_layer)
+                attn_output = self.dense(attn_output)
 
             if output_attentions:
-                return output_tensor, present, attention_probs
+                return attn_output, present, attention_probs
             else:
-                return output_tensor, present
+                return attn_output, present
 
 
 class FalconFlashAttention2(FalconAttention):
@@ -516,6 +544,15 @@ class FalconFlashAttention2(FalconAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -564,6 +601,12 @@ def forward(
 
         past_key_value = (key_layer, value_layer) if use_cache else None
 
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_layer = query_layer.transpose(1, 2)
+        key_layer = key_layer.transpose(1, 2)
+        value_layer = value_layer.transpose(1, 2)
+
         if alibi is not None:
             raise ValueError("`alibi` is not supported when `use_flash_attn` is True")
 
@@ -625,6 +668,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -645,13 +694,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -712,17 +761,20 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return x
 
 
+FALCON_ATTENTION_CLASSES = {
+    "eager": FalconAttention,
+    "sdpa": FalconAttention,  # FalconAttention originally implemented both a forward with & without SDPA
+    "flash_attention_2": FalconFlashAttention2,
+}
+
+
 class FalconDecoderLayer(nn.Module):
     def __init__(self, config: FalconConfig):
         super().__init__()
         hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
 
-        self.self_attention = (
-            FalconAttention(config)
-            if not getattr(config, "_flash_attn_2_enabled", False)
-            else FalconFlashAttention2(config)
-        )
+        self.self_attention = FALCON_ATTENTION_CLASSES[config._attn_implementation](config)
         self.mlp = FalconMLP(config)
         self.hidden_dropout = config.hidden_dropout
         self.config = config
@@ -890,6 +942,7 @@ class FalconPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["FalconDecoderLayer"]
     _supports_flash_attn_2 = True
+    _supports_sdpa = True
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
@@ -910,6 +963,25 @@ def _init_weights(self, module: nn.Module):
             module.bias.data.zero_()
             module.weight.data.fill_(1.0)
 
+    # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
+    @classmethod
+    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> "PretrainedConfig":
+        # NOTE: Falcon supported SDPA from PyTorch 2.0. We keep it like that for backward compatibility (automatically use SDPA for torch>=2.0).
+        if hard_check_only:
+            if not is_torch_greater_or_equal_than_2_0:
+                raise ImportError("PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.0.")
+
+        if not is_torch_greater_or_equal_than_2_0:
+            return config
+
+        _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
+        if _is_bettertransformer:
+            return config
+
+        if not hard_check_only:
+            config._attn_implementation = "sdpa"
+        return config
+
 
 @add_start_docstrings(
     "The bare Falcon Model transformer outputting raw hidden-states without any specific head on top.",
@@ -928,6 +1000,8 @@ def __init__(self, config: FalconConfig):
 
         # Transformer blocks
         self.h = nn.ModuleList([FalconDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
 
         # Final Layer Norm
         self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
@@ -981,12 +1055,6 @@ def forward(
         if past_key_values is None:
             past_key_values = tuple([None] * len(self.h))
 
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape batch_size x num_heads x N x N
-        # head_mask has shape n_layer x batch x num_heads x N x N
-        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
-
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
 
@@ -1025,15 +1093,61 @@ def forward(
                 )
                 position_ids = position_ids.unsqueeze(0)
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            if alibi is None:
+                attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                    attention_mask,
+                    (batch_size, seq_length),
+                    inputs_embeds,
+                    past_key_values_length,
+                )
+            elif head_mask is None:
+                alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:])
+
+                attention_mask_2d = attention_mask
+                # We don't call _prepare_4d_causal_attention_mask_for_sdpa as we need to mask alibi using the 4D attention_mask untouched.
+                attention_mask = _prepare_4d_causal_attention_mask(
+                    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+                )
+
+                # We take care to integrate alibi bias in the attention_mask here.
+                if attention_mask_2d is None:
+                    attention_mask = alibi / math.sqrt(self.config.hidden_size // self.num_heads)
+                else:
+                    attention_mask = torch.masked_fill(
+                        alibi / math.sqrt(self.config.hidden_size // self.num_heads),
+                        attention_mask < -1,
+                        torch.finfo(alibi.dtype).min,
+                    )
+
+                    # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
+                    # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
+                    if seq_length > 1:
+                        attention_mask = AttentionMaskConverter._unmask_unattended(
+                            attention_mask, attention_mask_2d, unmasked_value=0.0
+                        )
+            else:
+                # PyTorch SDPA does not support head_mask, we fall back on the eager implementation in this case.
+                attention_mask = _prepare_4d_causal_attention_mask(
+                    attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
+                )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
                 attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
             )
 
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape batch_size x num_heads x N x N
+        # head_mask has shape n_layer x batch x num_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
diff --git a/src/transformers/models/fuyu/image_processing_fuyu.py b/src/transformers/models/fuyu/image_processing_fuyu.py
index 0e415980c97fdd..ba5d52a58a80af 100644
--- a/src/transformers/models/fuyu/image_processing_fuyu.py
+++ b/src/transformers/models/fuyu/image_processing_fuyu.py
@@ -54,7 +54,7 @@
 
 
 def make_list_of_list_of_images(
-    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput]
+    images: Union[List[List[ImageInput]], List[ImageInput], ImageInput],
 ) -> List[List[ImageInput]]:
     if is_valid_image(images):
         return [[images]]
diff --git a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
index 2d12d590974276..b6b03d07b9d11e 100644
--- a/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
+++ b/src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -22,6 +22,7 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
+from ...modeling_attn_mask_utils import AttentionMaskConverter
 from ...modeling_outputs import (
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
@@ -34,6 +35,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
 )
 from .configuration_gpt_bigcode import GPTBigCodeConfig
@@ -127,6 +129,7 @@ def __init__(self, config, is_cross_attention=False, layer_idx=None):
         self.scale_attention_softmax_in_fp32 = (
             config.scale_attention_softmax_in_fp32 and config.attention_softmax_in_fp32
         )
+        self.attn_pdrop = config.attn_pdrop
 
         if self.is_cross_attention:
             if self.multi_query:
@@ -292,6 +295,15 @@ class GPTBigCodeFlashAttention2(GPTBigCodeAttention):
     API of flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -349,7 +361,7 @@ def forward(
             key = key.transpose(1, 2).reshape(batch_size, tgt, self.num_heads, self.head_dim)
             value = value.transpose(1, 2).reshape(batch_size, tgt, self.num_heads, self.head_dim)
 
-        attn_dropout = self.config.attn_pdrop if self.training else 0.0
+        attn_dropout = self.attn_pdrop if self.training else 0.0
 
         softmax_dtype = torch.float32 if self.attention_softmax_in_fp32 else query.dtype
         upcast = query.dtype != softmax_dtype
@@ -422,6 +434,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -442,13 +460,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -493,6 +511,150 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         )
 
 
+class GPTBigCodeSdpaAttention(GPTBigCodeAttention):
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        if head_mask is not None:
+            # The super dispatch is done in the forward.
+            raise ValueError(
+                "PyTorch SDPA does not support head_mask. Please open an issue in Transformers repository."
+            )
+
+        scale = None
+        if not self.scale_attn_weights:
+            scale = 1
+
+        # MQA models: (batch_size, query_length, num_heads * head_dim)
+        # MHA models: (batch_size, num_heads, query_length, head_dim)
+        query_shape = query.shape
+        batch_size = query_shape[0]
+        key.shape[-2]
+
+        if self.multi_query:
+            query_length = query_shape[1]
+
+            # SDPA requires the dimension [..., sequence_length, head_dim].
+            query = query.view(batch_size, query_length, self.num_heads, self.head_dim).transpose(1, 2)
+
+            # Without these unsqueeze, SDPA complains as the query and key/value have a different number of dimensions.
+            key = key.unsqueeze(1)
+            value = value.unsqueeze(1)
+
+            # Although these expand are not numerically useful, PyTorch 2.1 can not dispatch to memory-efficient backend
+            # and flash attention backend (No available kernel.  Aborting execution.) from the shapes
+            # query = [batch_size, num_heads, query_length, head_dim]
+            # key = [batch_size, 1, past_length, head_dim]
+            # value = [batch_size, 1, past_length, head_dim]
+            #
+            # so we could do:
+            #
+            # key = key.expand(-1, self.num_heads, -1, -1)
+            # value = value.expand(-1, self.num_heads, -1, -1)
+            #
+            # However SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+            # so we always dispatch to the math path: https://github.com/pytorch/pytorch/issues/112577.
+            # Arguably we could still do expand + contiguous when `query.device.type == "cuda"` in order to dispatch on memory-efficient
+            # backend, but it feels very hacky.
+        else:
+            query_length = query_shape[-1]
+
+            # See the comment above.
+            if query.device.type == "cuda" and attention_mask is not None:
+                query = query.contiguous()
+                key = key.contiguous()
+                value = value.contiguous()
+
+        sdpa_result = torch.nn.functional.scaled_dot_product_attention(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            dropout_p=self.attn_pdrop if self.training else 0.0,
+            # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
+            is_causal=self.is_causal and attention_mask is None and query_length > 1,
+            scale=scale,
+        )
+
+        if self.multi_query:
+            # (batch_size, num_heads, seq_len, head_dim) --> (batch_size, seq_len, num_heads, head_dim)
+            sdpa_result = sdpa_result.transpose(1, 2)
+
+            # Reshape is kind of expensive here, as it does a memory copy,
+            # but I did not manage to make away without it (logits do not match when using view)
+            # (batch_size, seq_len, num_heads, head_dim) --> (batch_size, seq_len, num_heads * head_dim)
+            sdpa_result = sdpa_result.reshape(query_shape)
+
+        return sdpa_result, None
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_past: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ) -> Union[
+        Tuple[torch.Tensor, Optional[torch.Tensor]],
+        Tuple[torch.Tensor, Optional[torch.Tensor], Tuple[torch.Tensor, ...]],
+    ]:
+        if encoder_hidden_states is not None:
+            if not hasattr(self, "q_attn") or not self.is_cross_attention:
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `GPTBigCodeAttention(..., is_cross_attention=True)`."
+                )
+
+            query = self.q_attn(hidden_states)
+            key_value = self.c_attn(encoder_hidden_states)
+            attention_mask = encoder_attention_mask
+        elif self.multi_query:
+            query, key_value = self.c_attn(hidden_states).split((self.embed_dim, 2 * self.kv_dim), dim=2)
+        else:
+            # Note: We split as (self.num_heads, 3, self.head_dim) instead of (3, self.num_heads, self.head_dim),
+            # i.e., the memory layout is not the same as GPT2.
+            # This makes the concatenation with past_key_value more efficient.
+            query, key_value = (
+                self.c_attn(hidden_states)
+                .view(*hidden_states.shape[:2], self.num_heads, 3 * self.head_dim)
+                .transpose(1, 2)
+                .split((self.head_dim, 2 * self.head_dim), dim=3)
+            )
+
+        if layer_past is not None:
+            key_value = torch.cat((layer_past, key_value), dim=-2)
+        present = key_value if use_cache else None
+
+        key, value = key_value.split((self.head_dim, self.head_dim), dim=-1)
+
+        if not output_attentions and head_mask is None:
+            # Difference with the original implementation: there is no need to transpose the key here,
+            # as SDPA expects seq_length to be at index -2 for the key as well
+            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+        else:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "GPTBigCodeModel is using GPTBigCodeSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` and `head_mask` not None."
+                ' Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            attn_output, attn_weights = super()._attn(query, key.transpose(-1, -2), value, attention_mask, head_mask)
+
+        if not self.multi_query:
+            attn_output = attn_output.transpose(1, 2).reshape(hidden_states.shape)
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        outputs = (attn_output, present)
+        if output_attentions:
+            if self.multi_query:
+                # Transpose to return weights in the usual format (batch_size, num_heads, query_length, key_length)
+                attn_weights = attn_weights.transpose(1, 2)
+            outputs += (attn_weights,)
+
+        return outputs
+
+
 class GPTBigCodeMLP(nn.Module):
     def __init__(self, intermediate_size, config):
         super().__init__()
@@ -511,6 +673,13 @@ def forward(self, hidden_states: Optional[Tuple[torch.FloatTensor]]) -> torch.Fl
         return hidden_states
 
 
+GPTBIGCODE_ATTENTION_CLASSES = {
+    "eager": GPTBigCodeAttention,
+    "flash_attention_2": GPTBigCodeFlashAttention2,
+    "sdpa": GPTBigCodeSdpaAttention,
+}
+
+
 class GPTBigCodeBlock(nn.Module):
     def __init__(self, config, layer_idx=None):
         super().__init__()
@@ -518,21 +687,19 @@ def __init__(self, config, layer_idx=None):
         self.inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
 
         self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
-        self.attn = (
-            GPTBigCodeAttention(config, layer_idx=layer_idx)
-            if not getattr(config, "_flash_attn_2_enabled", False)
-            else GPTBigCodeFlashAttention2(config, layer_idx=layer_idx)
-        )
+
+        self.attn = GPTBIGCODE_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
+
         self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
 
         if config.add_cross_attention:
             if config.multi_query:
                 raise NotImplementedError("Cross-attention not implemented for MQA")
-            self.crossattention = (
-                GPTBigCodeAttention(config, is_cross_attention=True, layer_idx=layer_idx)
-                if not getattr(config, "_flash_attn_2_enabled", False)
-                else GPTBigCodeFlashAttention2(config, is_cross_attention=True, layer_idx=layer_idx)
+
+            self.crossattention = GPTBIGCODE_ATTENTION_CLASSES[config._attn_implementation](
+                config, is_cross_attention=True, layer_idx=layer_idx
             )
+
             self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
 
         self.mlp = GPTBigCodeMLP(self.inner_dim, config)
@@ -613,6 +780,7 @@ class GPTBigCodePreTrainedModel(PreTrainedModel):
     _no_split_modules = ["GPTBigCodeBlock"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_sdpa = True
 
     def __init__(self, *inputs, **kwargs):
         super().__init__(*inputs, **kwargs)
@@ -754,6 +922,9 @@ def __init__(self, config):
 
         self.gradient_checkpointing = False
 
+        self._use_sdpa = config._attn_implementation == "sdpa"
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -834,7 +1005,7 @@ def forward(
         key_length = past_length + query_length
         self_attention_mask = self.bias[None, key_length - query_length : key_length, :key_length]
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask.bool() if (attention_mask is not None and 0 in attention_mask) else None
             encoder_attention_mask = (
@@ -851,7 +1022,34 @@ def forward(
 
             # MQA models: (batch_size, query_length, n_heads, key_length)
             # MHA models: (batch_size, n_heads, query_length, key_length)
-            attention_mask = self_attention_mask.unsqueeze(2 if self.multi_query else 1)
+            self_attention_mask = self_attention_mask.unsqueeze(2 if self.multi_query else 1)
+
+            if self._use_sdpa and head_mask is None and not output_attentions:
+                # output_attentions=True can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                if self.multi_query:
+                    # gpt_bigcode using MQA has the bad taste to use a causal mask with shape
+                    # [batch_size, target_length, 1, source_length], not compatible with SDPA, hence this transpose.
+                    self_attention_mask = self_attention_mask.transpose(1, 2)
+
+                if query_length > 1 and attention_mask is not None:
+                    # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend
+                    # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213
+                    self_attention_mask = AttentionMaskConverter._unmask_unattended(
+                        self_attention_mask, attention_mask, unmasked_value=True
+                    )
+
+                # SDPA with a custom mask is much faster in fp16/fp32 dtype rather than bool. Cast here to floating point instead of at every layer.
+                dtype = self.wte.weight.dtype
+                self_attention_mask = torch.where(
+                    self_attention_mask,
+                    torch.full([], 0.0, dtype=dtype, device=self_attention_mask.device),
+                    torch.full(
+                        [], torch.finfo(self.wte.weight.dtype).min, dtype=dtype, device=self_attention_mask.device
+                    ),
+                )
+
+            attention_mask = self_attention_mask
 
             # If a 2D or 3D attention mask is provided for the cross-attention
             # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
diff --git a/src/transformers/models/gpt_neo/modeling_gpt_neo.py b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
index 368920f3769c6f..a6a73dbb8cfd9f 100755
--- a/src/transformers/models/gpt_neo/modeling_gpt_neo.py
+++ b/src/transformers/models/gpt_neo/modeling_gpt_neo.py
@@ -36,11 +36,13 @@
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
 from ...utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     is_torch_fx_available,
     logging,
 )
@@ -55,6 +57,9 @@
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
+    if not is_torch_greater_or_equal_than_1_13:
+        import torch.fx
+
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
 
@@ -295,6 +300,15 @@ class GPTNeoFlashAttention2(GPTNeoSelfAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def forward(
         self,
         hidden_states,
@@ -396,6 +410,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -416,13 +436,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -467,6 +487,12 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         )
 
 
+GPT_NEO_ATTENTION_CLASSES = {
+    "eager": GPTNeoSelfAttention,
+    "flash_attention_2": GPTNeoFlashAttention2,
+}
+
+
 class GPTNeoAttention(nn.Module):
     def __init__(self, config, layer_id=0):
         super().__init__()
@@ -475,11 +501,7 @@ def __init__(self, config, layer_id=0):
         self.attention_type = self.attention_layers[layer_id]
 
         if self.attention_type in ["global", "local"]:
-            self.attention = (
-                GPTNeoSelfAttention(config, self.attention_type)
-                if not getattr(config, "_flash_attn_2_enabled", False)
-                else GPTNeoFlashAttention2(config, self.attention_type)
-            )
+            self.attention = GPT_NEO_ATTENTION_CLASSES[config._attn_implementation](config, self.attention_type)
         else:
             raise NotImplementedError(
                 "Only attn layer types 'global' and 'local' exist, but got `config.attention_layers`: "
@@ -698,6 +720,7 @@ def __init__(self, config):
         self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
         self.drop = nn.Dropout(float(config.embed_dropout))
         self.h = nn.ModuleList([GPTNeoBlock(config, layer_id=i) for i in range(config.num_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
 
         self.gradient_checkpointing = False
@@ -775,7 +798,7 @@ def forward(
         hidden_states = inputs_embeds + position_embeds
 
         # Attention mask.
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         else:
diff --git a/src/transformers/models/gpt_neox/modeling_gpt_neox.py b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
index 33cc00cc2f42fd..d1c10f58d9d67a 100755
--- a/src/transformers/models/gpt_neox/modeling_gpt_neox.py
+++ b/src/transformers/models/gpt_neox/modeling_gpt_neox.py
@@ -20,6 +20,7 @@
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+from torch.nn import functional as F
 
 from ...activations import ACT2FN
 from ...file_utils import (
@@ -36,10 +37,15 @@
     TokenClassifierOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import logging
+from ...utils import is_flash_attn_2_available, is_flash_attn_greater_or_equal_2_10, logging
 from .configuration_gpt_neox import GPTNeoXConfig
 
 
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+
 logger = logging.get_logger(__name__)
 
 _CHECKPOINT_FOR_DOC = "trl-internal-testing/tiny-random-GPTNeoXForCausalLM"
@@ -52,6 +58,19 @@
 ]
 
 
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
 class GPTNeoXPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
@@ -63,6 +82,7 @@ class GPTNeoXPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["GPTNeoXLayer"]
     _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
 
     def _init_weights(self, module):
         """Initialize the weights"""
@@ -100,6 +120,7 @@ def __init__(self, config):
         self.query_key_value = nn.Linear(config.hidden_size, 3 * config.hidden_size)
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.attention_dropout = nn.Dropout(config.attention_dropout)
+        self.is_causal = True
 
     def _init_bias(self, max_positions, device=None):
         self.register_buffer(
@@ -146,6 +167,7 @@ def forward(
         layer_past: Optional[Tuple[torch.Tensor]] = None,
         use_cache: Optional[bool] = False,
         output_attentions: Optional[bool] = False,
+        padding_mask: Optional[torch.Tensor] = None,
     ):
         has_layer_past = layer_past is not None
 
@@ -277,6 +299,226 @@ def _attn(self, query, key, value, attention_mask=None, head_mask=None):
         return attn_output, attn_weights
 
 
+class GPTNeoXFlashAttention2(GPTNeoXAttention):
+    """
+    GPTNeoX flash attention module. This module inherits from `GPTNeoXAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.FloatTensor,
+        attention_mask: torch.FloatTensor,
+        position_ids: torch.LongTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        layer_past: Optional[Tuple[torch.Tensor]] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ):
+        has_layer_past = layer_past is not None
+
+        # Compute QKV
+        # Attention heads [batch, seq_len, hidden_size]
+        #   --> [batch, seq_len, (np * 3 * head_size)]
+        qkv = self.query_key_value(hidden_states)
+
+        # [batch, seq_len, (num_heads * 3 * head_size)]
+        #   --> [batch, seq_len, num_heads, 3 * head_size]
+        new_qkv_shape = qkv.size()[:-1] + (self.num_attention_heads, 3 * self.head_size)
+        qkv = qkv.view(*new_qkv_shape)
+
+        # [batch, seq_len, num_attention_heads, 3 * head_size] --> 3 [batch, num_attention_heads, seq_len, head_size]
+        query = qkv[..., : self.head_size].permute(0, 2, 1, 3)
+        key = qkv[..., self.head_size : 2 * self.head_size].permute(0, 2, 1, 3)
+        value = qkv[..., 2 * self.head_size :].permute(0, 2, 1, 3)
+
+        query_length = query.shape[-2]
+
+        # Compute rotary embeddings on rotary_ndims
+        query_rot = query[..., : self.rotary_ndims]
+        query_pass = query[..., self.rotary_ndims :]
+        key_rot = key[..., : self.rotary_ndims]
+        key_pass = key[..., self.rotary_ndims :]
+
+        # Compute token offset for rotary embeddings (when decoding)
+        seq_len = key.shape[-2]
+        if has_layer_past:
+            seq_len += layer_past[0].shape[-2]
+        cos, sin = self.rotary_emb(value, seq_len=seq_len)
+        query, key = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
+        query = torch.cat((query, query_pass), dim=-1)
+        key = torch.cat((key, key_pass), dim=-1)
+
+        # Cache QKV values
+        if has_layer_past:
+            past_key = layer_past[0]
+            past_value = layer_past[1]
+            key = torch.cat((past_key, key), dim=-2)
+            value = torch.cat((past_value, value), dim=-2)
+        present = (key, value) if use_cache else None
+
+        # GPT-neo-X casts query and key in fp32 to apply rotary embedding in full precision
+        target_dtype = value.dtype
+        if query.dtype != target_dtype:
+            query = query.to(target_dtype)
+        if key.dtype != target_dtype:
+            key = key.to(target_dtype)
+
+        # Permute to get the expected shape for Flash Attention
+        query = query.permute(0, 2, 1, 3)
+        key = key.permute(0, 2, 1, 3)
+        value = value.permute(0, 2, 1, 3)
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 / bfloat16 just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        input_dtype = query.dtype
+        if input_dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query = query.to(target_dtype)
+            key = key.to(target_dtype)
+            value = value.to(target_dtype)
+
+        attention_dropout = self.config.attention_dropout if self.training else 0.0
+
+        # Compute attention
+        attn_weights = self._flash_attention_forward(
+            query, key, value, attention_mask, query_length, dropout=attention_dropout, softmax_scale=self.norm_factor
+        )
+
+        # Reshape outputs
+        attn_output = attn_weights.reshape(
+            attn_weights.shape[0], attn_weights.shape[1], self.num_attention_heads * self.head_size
+        )
+        attn_output = self.dense(attn_output)
+
+        outputs = (attn_output, present)
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input with num_heads->num_attention_heads
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_attention_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
 def attention_mask_func(attention_scores, ltor_mask):
     attention_scores.masked_fill_(~ltor_mask, torch.finfo(attention_scores.dtype).min)
     return attention_scores
@@ -302,7 +544,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -332,7 +574,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
         t = t / self.scaling_factor
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -359,7 +601,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
 
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -416,6 +658,12 @@ def forward(self, hidden_states):
         return hidden_states
 
 
+GPT_NEOX_ATTENTION_CLASSES = {
+    "eager": GPTNeoXAttention,
+    "flash_attention_2": GPTNeoXFlashAttention2,
+}
+
+
 class GPTNeoXLayer(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -424,7 +672,7 @@ def __init__(self, config):
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.post_attention_dropout = nn.Dropout(config.hidden_dropout)
         self.post_mlp_dropout = nn.Dropout(config.hidden_dropout)
-        self.attention = GPTNeoXAttention(config)
+        self.attention = GPT_NEOX_ATTENTION_CLASSES[config._attn_implementation](config)
         self.mlp = GPTNeoXMLP(config)
 
     def forward(
@@ -539,6 +787,7 @@ def __init__(self, config):
         self.emb_dropout = nn.Dropout(config.hidden_dropout)
         self.layers = nn.ModuleList([GPTNeoXLayer(config) for _ in range(config.num_hidden_layers)])
         self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
         self.gradient_checkpointing = False
 
@@ -615,20 +864,23 @@ def forward(
         if attention_mask is not None:
             assert batch_size > 0, "batch_size has to be defined and > 0"
             attention_mask = attention_mask.view(batch_size, -1)
-            # We create a 3D attention mask from a 2D tensor mask.
-            # Sizes are [batch_size, 1, 1, to_seq_length]
-            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-            # this attention mask is more simple than the triangular masking of causal attention
-            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-            attention_mask = attention_mask[:, None, None, :]
-
-            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-            # masked positions, this operation will create a tensor which is 0.0 for
-            # positions we want to attend and the dtype's smallest value for masked positions.
-            # Since we are adding it to the raw scores before the softmax, this is
-            # effectively the same as removing these entirely.
-            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
-            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+            if self._use_flash_attention_2:
+                attention_mask = attention_mask if 0 in attention_mask else None
+            else:
+                # We create a 3D attention mask from a 2D tensor mask.
+                # Sizes are [batch_size, 1, 1, to_seq_length]
+                # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+                # this attention mask is more simple than the triangular masking of causal attention
+                # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+                attention_mask = attention_mask[:, None, None, :]
+
+                # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+                # masked positions, this operation will create a tensor which is 0.0 for
+                # positions we want to attend and the dtype's smallest value for masked positions.
+                # Since we are adding it to the raw scores before the softmax, this is
+                # effectively the same as removing these entirely.
+                attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+                attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
diff --git a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
index 4f0841e3c34f95..d92787677161e0 100755
--- a/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
+++ b/src/transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py
@@ -253,7 +253,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
diff --git a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py
index 2ac76be8d25982..cb0d85722262bc 100644
--- a/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py
+++ b/src/transformers/models/gptsan_japanese/modeling_gptsan_japanese.py
@@ -188,17 +188,8 @@ def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> Tuple[to
         hidden_states = hidden_states.to(self.dtype)
 
         if self.jitter_noise > 0:
-            # Get the lower and upper bound of the uniform distribution
-            # Adapted from: https://stackoverflow.com/questions/44328530/how-to-get-a-uniform-distribution-in-a-range-r1-r2-in-pytorch
-            distrib_lower_bound = 1.0 - self.jitter_noise
-            distrib_upper_bound = 1.0 + self.jitter_noise
-
-            uniform_distrib = torch.rand(hidden_states.shape, device=hidden_states.device, dtype=self.dtype)
-            uniform_distrib = uniform_distrib * (distrib_lower_bound - distrib_upper_bound)
-
-            uniform_distrib = uniform_distrib + distrib_upper_bound
             # Multiply the token inputs by the uniform distribution - adding some noise
-            hidden_states *= uniform_distrib
+            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
 
         # Shape: [num_groups, tokens_per_group, num_experts]
         self._cast_classifier()
diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py
index f7881ddd39ed77..928f500b27c9ee 100644
--- a/src/transformers/models/idefics/modeling_idefics.py
+++ b/src/transformers/models/idefics/modeling_idefics.py
@@ -29,7 +29,7 @@
 
 from ... import PreTrainedModel
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
 from ...modeling_outputs import ModelOutput
 from ...modeling_utils import PretrainedConfig
 from ...pytorch_utils import ALL_LAYERNORM_LAYERS
@@ -578,6 +578,7 @@ def __init__(
         self.num_heads = num_heads
         self.head_dim = hidden_size // num_heads
         self.dropout = dropout
+        self.is_causal = True
 
         if (self.head_dim * num_heads) != self.hidden_size:
             raise ValueError(
@@ -687,12 +688,21 @@ def forward(
                     f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                 )
 
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
         attn_output = nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
             value_states,
             attn_mask=attention_mask,
             dropout_p=self.dropout,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
         )
 
         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
@@ -864,16 +874,20 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         image_hidden_states: Optional[torch.Tensor] = None,
         image_attention_mask: Optional[torch.Tensor] = None,
+        cross_attention_gate: Optional[torch.Tensor] = None,
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = False,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
-        no_images: Optional[bool] = False,
     ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
         """
         Args:
             hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
             attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                 `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            image_attention_mask (`torch.FloatTensor`, *optional*): image attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            cross_attention_gate (`torch.FloatTensor`, *optional*):
+                gate of size `(batch, seq_len)` used to zero-out cross-attention output for tokens attending no images.
             output_attentions (`bool`, *optional*):
                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                 returned tensors for more detail.
@@ -881,7 +895,6 @@ def forward(
                 If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                 (see `past_key_values`).
             past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
-            no_images (`bool`, *optional*, defaults to `False`): If `True` the vision part is ignored
         """
         if image_hidden_states is None:
             raise ValueError(
@@ -889,6 +902,11 @@ def forward(
                 " conditioned on."
             )
 
+        if cross_attention_gate is None:
+            raise ValueError(
+                "`cross_attention_gate` is required for Idefics cross attention module to zero-out the cross-attention hidden_states attending to no images."
+            )
+
         if past_key_value is not None:
             raise NotImplementedError("Past key value states are not implemented for Idefics cross attention module.")
 
@@ -904,9 +922,9 @@ def forward(
             output_attentions=output_attentions,
         )
         hidden_states = nn.functional.dropout(hidden_states, p=self.config, training=self.training)
-        # when there are no images the model is used in pure language mode
-        gate = 0 if no_images else 1
-        hidden_states = residual + gate * self.act_cross_attn(self.alpha_cross_attn) * hidden_states
+        # Fill in zeros for cross_attention hidden_states of tokens attending to no images
+        hidden_states[cross_attention_gate == 0] = hidden_states[cross_attention_gate == 0].fill_(0)
+        hidden_states = residual + self.act_cross_attn(self.alpha_cross_attn) * hidden_states
 
         # Fully Connected
         residual = hidden_states
@@ -952,6 +970,7 @@ class IdeficsPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["IdeficsDecoderLayer", "IdeficsGatedCrossAttentionLayer"]
+    _supports_sdpa = True
 
     def _init_weights(self, module):
         # important: this ported version of Idefics isn't meant for training from scratch - only
@@ -967,6 +986,18 @@ def _init_weights(self, module):
             if module.padding_idx is not None:
                 module.weight.data[module.padding_idx].zero_()
 
+    # Adapted from transformers.modeling_utils.PreTrainedModel._check_and_enable_sdpa
+    @classmethod
+    def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig:
+        # We remove the checks on `is_torch_sdpa_available()` and `cls._supports_sdpa` as Falcon supports SDPA from torch==2.0.0 (no requirement on 2.1).
+        _is_bettertransformer = getattr(cls, "use_bettertransformer", False)
+        if _is_bettertransformer:
+            return config
+
+        if not hard_check_only:
+            config._attn_implementation = "sdpa"
+        return config
+
 
 LLAMA_INPUTS_DOCSTRING = r"""
     Args:
@@ -1166,14 +1197,12 @@ def forward(
             )
             position_ids = position_ids.unsqueeze(0)
 
-        no_images = False
         if (pixel_values, image_encoder_embeddings, perceiver_embeddings).count(None) != 2:
             raise ValueError(
                 "Exactly 1 of pixel_values, image_encoder_embeddings or perceiver_embeddings has to be not-None."
             )
 
         elif pixel_values is not None:
-            no_images = len(torch.nonzero(pixel_values)) == 0
             pixel_values = pixel_values.to(dtype=self.dtype, device=device)  # fp16 compatibility
             batch_size, num_images = pixel_values.shape[:2]
             pixel_values = pixel_values.contiguous().view(batch_size * num_images, *pixel_values.shape[2:])
@@ -1218,6 +1247,15 @@ def forward(
         else:
             image_attention_mask = None
 
+        # cross_attention_gate:
+        # For any tokens attending to no images, the hidden_states comming out of the cross-attention should be zeroed-out.
+        # `image_attention_mask` has shape [bsz, 1, num_images, hidden_size] with elements equal to either 0.0 or a very negative number.
+        # If any of the elements are 0.0, then the token is attending to at least one image and the gate value is 1. Otherwise the gate value is 0.
+        # `cross_attention_gate` has shape [bsz, seq_len] with elements equal to either 0.0 or 1.0.
+        cross_attention_gate = ((((image_attention_mask == 0.0).any(dim=-1)).to(dtype=self.dtype)).squeeze(dim=1)).to(
+            device
+        )
+
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
         # embed positions
@@ -1225,7 +1263,7 @@ def forward(
             attention_mask = torch.ones(
                 (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
             )
-        attention_mask = _prepare_4d_causal_attention_mask(
+        attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
             attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
         )
 
@@ -1257,9 +1295,9 @@ def vblock(
                 past_key_value,
                 image_hidden_states,
                 image_attention_mask,
+                cross_attention_gate,
                 output_attentions,
                 use_cache,
-                no_images,
                 layer_idx,
                 cross_layer_interval,
                 gated_cross_attn_layers,
@@ -1272,10 +1310,10 @@ def vblock(
                         attention_mask=attention_mask,
                         image_hidden_states=image_hidden_states,
                         image_attention_mask=image_attention_mask,
+                        cross_attention_gate=cross_attention_gate,
                         output_attentions=output_attentions,
                         use_cache=use_cache,
                         past_key_value=None,  # not implemented
-                        no_images=no_images,
                     )
                     hidden_states = outputs[0]
 
@@ -1307,9 +1345,9 @@ def vblock(
                     past_key_value,
                     image_hidden_states,
                     image_attention_mask,
+                    cross_attention_gate,
                     output_attentions,
                     use_cache,
-                    no_images,
                     idx,
                     self.cross_layer_interval,
                     self.gated_cross_attn_layers,
@@ -1323,9 +1361,9 @@ def vblock(
                     past_key_value=past_key_value,
                     image_hidden_states=image_hidden_states,
                     image_attention_mask=image_attention_mask,
+                    cross_attention_gate=cross_attention_gate,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
-                    no_images=no_images,
                     layer_idx=idx,
                     cross_layer_interval=self.cross_layer_interval,
                     gated_cross_attn_layers=self.gated_cross_attn_layers,
@@ -1495,9 +1533,10 @@ def forward(
 
         loss = None
         if labels is not None:
+            labels = labels.to(logits.device)
             # Shift so that tokens < n predict n
             if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
+                shift_attention_mask = attention_mask[..., 1:].to(logits.device)
                 shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
                 shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
             else:
diff --git a/src/transformers/models/informer/modeling_informer.py b/src/transformers/models/informer/modeling_informer.py
index 3abf48eaec03df..0fe108a6402425 100644
--- a/src/transformers/models/informer/modeling_informer.py
+++ b/src/transformers/models/informer/modeling_informer.py
@@ -81,71 +81,70 @@ def forward(self, features: torch.Tensor) -> torch.Tensor:
         )
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeries->Informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer
 class InformerStdScaler(nn.Module):
     """
-    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
-    by subtracting from the mean and dividing by the standard deviation.
-
-    Args:
-        dim (`int`):
-            Dimension along which to calculate the mean and standard deviation.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-5):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
+    subtracting from the mean and dividing by the standard deviation.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+    def __init__(self, config: InformerConfig):
         super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
 
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        denominator = weights.sum(self.dim, keepdim=self.keepdim)
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
         denominator = denominator.clamp_min(1.0)
-        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
+        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
 
-        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
         scale = torch.sqrt(variance + self.minimum_scale)
         return (data - loc) / scale, loc, scale
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeries->Informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer
 class InformerMeanScaler(nn.Module):
     """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
     accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        default_scale (`float`, *optional*, defaults to `None`):
-            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default minimum possible scale that is used for any item.
     """
 
-    def __init__(
-        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
-    ):
+    def __init__(self, config: InformerConfig):
         super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-        self.default_scale = default_scale
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
+        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
 
-    @torch.no_grad()
     def forward(
         self, data: torch.Tensor, observed_indicator: torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        # shape: (N, [C], T=1)
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
         ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
         num_observed = observed_indicator.sum(self.dim, keepdim=True)
 
@@ -173,26 +172,29 @@ def forward(
         return scaled_data, torch.zeros_like(scale), scale
 
 
-# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeries->Informer
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Informer,TimeSeries->Informer
 class InformerNOPScaler(nn.Module):
     """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False):
+    def __init__(self, config: InformerConfig):
         super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
 
     def forward(
-        self, data: torch.Tensor, observed_indicator: torch.Tensor
+        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
         scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
         loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
         return data, loc, scale
@@ -1447,11 +1449,11 @@ def __init__(self, config: InformerConfig):
         super().__init__(config)
 
         if config.scaling == "mean" or config.scaling is True:
-            self.scaler = InformerMeanScaler(dim=1, keepdim=True)
+            self.scaler = InformerMeanScaler(config)
         elif config.scaling == "std":
-            self.scaler = InformerStdScaler(dim=1, keepdim=True)
+            self.scaler = InformerStdScaler(config)
         else:
-            self.scaler = InformerNOPScaler(dim=1, keepdim=True)
+            self.scaler = InformerNOPScaler(config)
 
         if config.num_static_categorical_features > 0:
             self.embedder = InformerFeatureEmbedder(
diff --git a/src/transformers/models/led/modeling_led.py b/src/transformers/models/led/modeling_led.py
index 12a0c2a9915d21..4b0a5e6e85d1bc 100755
--- a/src/transformers/models/led/modeling_led.py
+++ b/src/transformers/models/led/modeling_led.py
@@ -1706,7 +1706,7 @@ def _pad_to_window_size(
 
         padding_len = (attention_window - seq_len % attention_window) % attention_window
         if padding_len > 0:
-            logger.info(
+            logger.warning_once(
                 f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
                 f"`config.attention_window`: {attention_window}"
             )
diff --git a/src/transformers/models/led/modeling_tf_led.py b/src/transformers/models/led/modeling_tf_led.py
index 879538bca76bf3..69e5576ed62c65 100644
--- a/src/transformers/models/led/modeling_tf_led.py
+++ b/src/transformers/models/led/modeling_tf_led.py
@@ -1859,7 +1859,7 @@ def _pad_to_window_size(
         padding_len = (attention_window - seq_len % attention_window) % attention_window
 
         if padding_len > 0:
-            logger.info(
+            logger.warning_once(
                 f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
                 f"`config.attention_window`: {attention_window}"
             )
diff --git a/src/transformers/models/llama/__init__.py b/src/transformers/models/llama/__init__.py
index 939756084d79ce..b5e9a60cda6e3c 100644
--- a/src/transformers/models/llama/__init__.py
+++ b/src/transformers/models/llama/__init__.py
@@ -16,6 +16,7 @@
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
+    is_flax_available,
     is_sentencepiece_available,
     is_tokenizers_available,
     is_torch_available,
@@ -55,6 +56,14 @@
         "LlamaForSequenceClassification",
     ]
 
+try:
+    if not is_flax_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_flax_llama"] = ["FlaxLlamaForCausalLM", "FlaxLlamaModel", "FlaxLlamaPreTrainedModel"]
+
 
 if TYPE_CHECKING:
     from .configuration_llama import LLAMA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlamaConfig
@@ -83,6 +92,14 @@
     else:
         from .modeling_llama import LlamaForCausalLM, LlamaForSequenceClassification, LlamaModel, LlamaPreTrainedModel
 
+    try:
+        if not is_flax_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel, FlaxLlamaPreTrainedModel
+
 
 else:
     import sys
diff --git a/src/transformers/models/llama/convert_llama_weights_to_hf.py b/src/transformers/models/llama/convert_llama_weights_to_hf.py
index 8b5930779c9b6e..d2fc3a79aff1b4 100644
--- a/src/transformers/models/llama/convert_llama_weights_to_hf.py
+++ b/src/transformers/models/llama/convert_llama_weights_to_hf.py
@@ -91,6 +91,7 @@ def write_model(model_path, input_base_path, model_size, tokenizer_path=None, sa
 
     params = read_json(os.path.join(input_base_path, "params.json"))
     num_shards = NUM_SHARDS[model_size]
+    params = params.get("model", params)
     n_layers = params["n_layers"]
     n_heads = params["n_heads"]
     n_heads_per_shard = n_heads // num_shards
@@ -109,7 +110,7 @@ def write_model(model_path, input_base_path, model_size, tokenizer_path=None, sa
         tokenizer.save_pretrained(model_path)
     vocab_size = tokenizer.vocab_size if tokenizer_path is not None else 32000
 
-    if "n_kv_heads" in params:
+    if params.get("n_kv_heads", None) is not None:
         num_key_value_heads = params["n_kv_heads"]  # for GQA / MQA
         num_local_key_value_heads = n_heads_per_shard // num_key_value_heads
         key_value_dim = dim // num_key_value_heads
diff --git a/src/transformers/models/llama/modeling_flax_llama.py b/src/transformers/models/llama/modeling_flax_llama.py
new file mode 100644
index 00000000000000..9430eba41c6548
--- /dev/null
+++ b/src/transformers/models/llama/modeling_flax_llama.py
@@ -0,0 +1,732 @@
+# coding=utf-8
+# Copyright 2023 Meta AI, EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flax LLaMA model."""
+from functools import partial
+from typing import Optional, Tuple
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import FlaxBaseModelOutput, FlaxCausalLMOutput
+from ...modeling_flax_utils import ACT2FN, FlaxPreTrainedModel, append_call_sample_docstring
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_llama import LlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlamaConfig"
+_CHECKPOINT_FOR_DOC = "afmck/testing-llama-tiny"
+_REAL_CHECKPOINT_FOR_DOC = "openlm-research/open_llama_3b_v2"
+
+LLAMA_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`LlamaConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16`, or
+            `jax.numpy.bfloat16`.
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def create_sinusoidal_positions(num_pos, dim):
+    inv_freq = 1.0 / (10000 ** (np.arange(0, dim, 2) / dim))
+    freqs = np.einsum("i , j -> i j", np.arange(num_pos), inv_freq).astype("float32")
+
+    emb = np.concatenate((freqs, freqs), axis=-1)
+    out = np.concatenate((np.sin(emb)[:, None, :], np.cos(emb)[:, None, :]), axis=-1)
+    return jnp.array(out[:, :, :num_pos])
+
+
+def rotate_half(tensor):
+    """Rotates half the hidden dims of the input."""
+    rotate_half_tensor = jnp.concatenate(
+        (-tensor[..., tensor.shape[-1] // 2 :], tensor[..., : tensor.shape[-1] // 2]), axis=-1
+    )
+    return rotate_half_tensor
+
+
+def apply_rotary_pos_emb(tensor, sin_pos, cos_pos):
+    return (tensor * cos_pos) + (rotate_half(tensor) * sin_pos)
+
+
+class FlaxLlamaRMSNorm(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.epsilon = self.config.rms_norm_eps
+        self.weight = self.param("weight", lambda _, shape: jnp.ones(shape), self.config.hidden_size)
+
+    def __call__(self, hidden_states):
+        variance = jnp.asarray(hidden_states, dtype=jnp.float32)
+        variance = jnp.power(variance, 2)
+        variance = variance.mean(-1, keepdims=True)
+        # use `jax.numpy.sqrt` as `jax.lax.rsqrt` does not match `torch.rsqrt`
+        hidden_states = hidden_states / jnp.sqrt(variance + self.epsilon)
+
+        return self.weight * jnp.asarray(hidden_states, dtype=self.dtype)
+
+
+class FlaxLlamaRotaryEmbedding(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        head_dim = self.config.hidden_size // self.config.num_attention_heads
+        self.sincos = create_sinusoidal_positions(self.config.max_position_embeddings, head_dim)
+
+    def __call__(self, key, query, position_ids):
+        sincos = self.sincos[position_ids]
+        sin_pos, cos_pos = jnp.split(sincos, 2, axis=-1)
+
+        key = apply_rotary_pos_emb(key, sin_pos, cos_pos)
+        query = apply_rotary_pos_emb(query, sin_pos, cos_pos)
+
+        key = jnp.asarray(key, dtype=self.dtype)
+        query = jnp.asarray(query, dtype=self.dtype)
+
+        return key, query
+
+
+class FlaxLlamaAttention(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+    causal: bool = True
+    is_cross_attention: bool = False
+
+    def setup(self):
+        config = self.config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.attention_softmax_in_fp32 = self.dtype is not jnp.float32
+
+        dense = partial(
+            nn.Dense,
+            self.embed_dim,
+            use_bias=config.attention_bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.o_proj = dense()
+
+        self.causal_mask = make_causal_mask(jnp.ones((1, config.max_position_embeddings), dtype="bool"), dtype="bool")
+        self.rotary_emb = FlaxLlamaRotaryEmbedding(config, dtype=self.dtype)
+
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    # Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoSelfAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slighly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        position_ids,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        query = self.q_proj(hidden_states)
+        key = self.k_proj(hidden_states)
+        value = self.v_proj(hidden_states)
+
+        query = self._split_heads(query)
+        key = self._split_heads(key)
+        value = self._split_heads(value)
+
+        key, query = self.rotary_emb(key, query, position_ids)
+
+        query_length, key_length = query.shape[1], key.shape[1]
+
+        if self.has_variable("cache", "cached_key"):
+            mask_shift = self.variables["cache"]["cache_index"]
+            max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+            causal_mask = lax.dynamic_slice(
+                self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+            )
+        else:
+            causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+
+        batch_size = hidden_states.shape[0]
+        causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+        attention_mask = combine_masks(attention_mask, causal_mask)
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.has_variable("cache", "cached_key") or init_cache:
+            key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask)
+
+        # transform boolean mask into float mask
+        attention_bias = lax.select(
+            attention_mask > 0,
+            jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+            jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+        )
+
+        # usual dot product attention
+        attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype
+        attn_weights = dot_product_attention_weights(
+            query,
+            key,
+            bias=attention_bias,
+            deterministic=deterministic,
+            dtype=attention_dtype,
+        )
+
+        if self.attention_softmax_in_fp32:
+            attn_weights = attn_weights.astype(self.dtype)
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.o_proj(attn_output)
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+class FlaxLlamaMLP(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        embed_dim = self.config.hidden_size
+        inner_dim = self.config.intermediate_size if self.config.intermediate_size is not None else 4 * embed_dim
+
+        kernel_init = jax.nn.initializers.normal(self.config.initializer_range)
+        self.act = ACT2FN[self.config.hidden_act]
+
+        self.gate_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.down_proj = nn.Dense(embed_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+        self.up_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, kernel_init=kernel_init)
+
+    def __call__(self, hidden_states):
+        up_proj_states = self.up_proj(hidden_states)
+        gate_states = self.act(self.gate_proj(hidden_states))
+
+        hidden_states = self.down_proj(up_proj_states * gate_states)
+        return hidden_states
+
+
+class FlaxLlamaDecoderLayer(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.input_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
+        self.self_attn = FlaxLlamaAttention(self.config, dtype=self.dtype)
+        self.post_attention_layernorm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
+        self.mlp = FlaxLlamaMLP(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+    ):
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        outputs = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+        )
+        # residual connection
+        attn_output = outputs[0]
+        hidden_states = residual + attn_output
+
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + hidden_states
+
+        return (hidden_states,) + outputs[1:]
+
+
+# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoPreTrainedModel with GPTNeo->Llama, GPT_NEO->LLAMA, transformer->model
+class FlaxLlamaPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = LlamaConfig
+    base_model_prefix = "model"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: LlamaConfig,
+        input_shape: Tuple = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(rngs, input_ids, attention_mask, position_ids, return_dict=False)["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length))
+        attention_mask = jnp.ones_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        params: dict = None,
+        past_key_values: dict = None,
+        dropout_rng: jax.random.PRNGKey = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        batch_size, sequence_length = input_ids.shape
+
+        if position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
+
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        if attention_mask is None:
+            attention_mask = jnp.ones((batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be changed by FlaxLlamaAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        outputs = self.module.apply(
+            inputs,
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+            not train,
+            False,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+            mutable=mutable,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past_key_values = outputs
+            outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past_key_values = outputs
+            outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        return outputs
+
+
+class FlaxLlamaLayerCollection(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.blocks = [
+            FlaxLlamaDecoderLayer(self.config, dtype=self.dtype, name=str(i))
+            for i in range(self.config.num_hidden_layers)
+        ]
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = False,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for block in self.blocks:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            layer_outputs = block(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                deterministic=deterministic,
+                init_cache=init_cache,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+        # this contains possible `None` values - `FlaxLlamaModule` will filter them out
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+
+        return outputs
+
+
+class FlaxLlamaModule(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.hidden_size = self.config.hidden_size
+        embedding_init = jax.nn.initializers.normal(stddev=self.config.initializer_range)
+        self.embed_tokens = nn.Embed(
+            self.config.vocab_size,
+            self.hidden_size,
+            embedding_init=embedding_init,
+            dtype=self.dtype,
+        )
+        self.layers = FlaxLlamaLayerCollection(self.config, dtype=self.dtype)
+        self.norm = FlaxLlamaRMSNorm(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic=True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        input_embeds = self.embed_tokens(input_ids.astype("i4"))
+
+        outputs = self.layers(
+            input_embeds,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = outputs[1] + (hidden_states,)
+            outputs = (hidden_states, all_hidden_states) + outputs[2:]
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=outputs[1],
+            attentions=outputs[-1],
+        )
+
+
+@add_start_docstrings(
+    "The bare Llama Model transformer outputting raw hidden-states without any specific head on top.",
+    LLAMA_START_DOCSTRING,
+)
+class FlaxLlamaModel(FlaxLlamaPreTrainedModel):
+    module_class = FlaxLlamaModule
+
+
+append_call_sample_docstring(
+    FlaxLlamaModel,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBaseModelOutput,
+    _CONFIG_FOR_DOC,
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
+)
+
+
+class FlaxLlamaForCausalLMModule(nn.Module):
+    config: LlamaConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.model = FlaxLlamaModule(self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.config.vocab_size,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        position_ids=None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        outputs = self.model(
+            input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        lm_logits = self.lm_head(hidden_states)
+
+        if not return_dict:
+            return (lm_logits,) + outputs[1:]
+
+        return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
+
+
+@add_start_docstrings(
+    """
+    The Llama Model transformer with a language modeling head (linear layer) on top.
+    """,
+    LLAMA_START_DOCSTRING,
+)
+# Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJForCausalLM with GPTJ->Llama
+class FlaxLlamaForCausalLM(FlaxLlamaPreTrainedModel):
+    module_class = FlaxLlamaForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since Llama uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+
+
+append_call_sample_docstring(
+    FlaxLlamaForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutput,
+    _CONFIG_FOR_DOC,
+    real_checkpoint=_REAL_CHECKPOINT_FOR_DOC,
+)
diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py
index b9625dd9213989..bba2680f5732e4 100644
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@@ -29,14 +29,21 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import AttentionMaskConverter, _prepare_4d_causal_attention_mask
+from ...cache_utils import Cache, DynamicCache
+from ...modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_attention_mask,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_utils import PreTrainedModel
-from ...pytorch_utils import ALL_LAYERNORM_LAYERS
+from ...pytorch_utils import ALL_LAYERNORM_LAYERS, is_torch_greater_or_equal_than_1_13
 from ...utils import (
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
@@ -52,6 +59,9 @@
 # This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
 # It means that the function will not be traced through and simply appear as a node in the graph.
 if is_torch_fx_available():
+    if not is_torch_greater_or_equal_than_1_13:
+        import torch.fx
+
     _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
 
 
@@ -74,9 +84,9 @@ def _get_unpad_data(attention_mask):
 
 def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None):
     warnings.warn(
-        "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils.AttentionMaskConverter._prepare_4d_attention_mask"
+        "Calling `transformers.models.llama.modeling_llama._prepare_4d_attention_mask` is deprecated and will be removed in v4.37. Use `transformers.modeling_attn_mask_utils._prepare_4d_attention_mask"
     )
-    return AttentionMaskConverter._prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
+    return _prepare_4d_attention_mask(mask=mask, dtype=dtype, tgt_len=tgt_len)
 
 
 def _make_causal_mask(
@@ -129,7 +139,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -158,7 +168,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
         t = t / self.scaling_factor
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -184,7 +194,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
 
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -275,9 +285,17 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class LlamaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: LlamaConfig):
+    def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
         self.attention_dropout = config.attention_dropout
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
@@ -335,7 +353,7 @@ def forward(
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
         **kwargs,
@@ -375,16 +393,19 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
         if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
@@ -439,12 +460,20 @@ class LlamaFlashAttention2(LlamaAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.LongTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
         **kwargs,
@@ -475,24 +504,21 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
-
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
-
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
         if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
         query_states = query_states.transpose(1, 2)
         key_states = key_states.transpose(1, 2)
         value_states = value_states.transpose(1, 2)
 
-        dropout_rate = 0.0 if not self.training else self.attention_dropout
+        dropout_rate = self.attention_dropout if self.training else 0.0
 
         # In PEFT, usually we cast the layer norms in float32 for training stability reasons
         # therefore the input hidden states gets silently casted in float32. Hence, we need
@@ -552,6 +578,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -572,13 +604,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -622,15 +654,107 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         )
 
 
+class LlamaSdpaAttention(LlamaAttention):
+    """
+    Llama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `LlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from LlamaAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if output_attentions:
+            # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "LlamaModel is using LlamaSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to the manual attention implementation, "
+                'but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
+            is_causal=self.is_causal and attention_mask is None and q_len > 1,
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
+LLAMA_ATTENTION_CLASSES = {
+    "eager": LlamaAttention,
+    "flash_attention_2": LlamaFlashAttention2,
+    "sdpa": LlamaSdpaAttention,
+}
+
+
 class LlamaDecoderLayer(nn.Module):
-    def __init__(self, config: LlamaConfig):
+    def __init__(self, config: LlamaConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = (
-            LlamaAttention(config=config)
-            if not getattr(config, "_flash_attn_2_enabled", False)
-            else LlamaFlashAttention2(config=config)
-        )
+
+        self.self_attn = LLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
         self.mlp = LlamaMLP(config)
         self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -725,6 +849,8 @@ class LlamaPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["LlamaDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_sdpa = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
@@ -773,13 +899,19 @@ def _init_weights(self, module):
             config.n_positions - 1]`.
 
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, decoder_sequence_length, embed_size_per_head)`.
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
 
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
 
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
@@ -820,7 +952,11 @@ def __init__(self, config: LlamaConfig):
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList([LlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList(
+            [LlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._use_sdpa = config._attn_implementation == "sdpa"
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
@@ -865,8 +1001,11 @@ def forward(
             raise ValueError("You have to specify either input_ids or inputs_embeds")
 
         past_key_values_length = 0
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -878,9 +1017,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions:
+            # output_attentions=True can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
@@ -900,21 +1048,19 @@ def forward(
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
+        next_decoder_cache = None
 
-        for idx, decoder_layer in enumerate(self.layers):
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
             if self.gradient_checkpointing and self.training:
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
                     attention_mask,
                     position_ids,
-                    past_key_value,
+                    past_key_values,
                     output_attentions,
                     use_cache,
                 )
@@ -923,7 +1069,7 @@ def forward(
                     hidden_states,
                     attention_mask=attention_mask,
                     position_ids=position_ids,
-                    past_key_value=past_key_value,
+                    past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                 )
@@ -931,7 +1077,7 @@ def forward(
             hidden_states = layer_outputs[0]
 
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
@@ -942,7 +1088,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = next_decoder_cache if use_cache else None
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
@@ -1023,7 +1171,6 @@ def forward(
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
         "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
         ```"""
-
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1081,16 +1228,33 @@ def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         if past_key_values is not None:
-            past_length = past_key_values[0][0].shape[2]
-
-            # Some generation methods already pass only the last input ID
-            if input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
             else:
-                # Default to old behavior: keep only final ID
-                remove_prefix_length = input_ids.shape[1] - 1
-
-            input_ids = input_ids[:, remove_prefix_length:]
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
diff --git a/src/transformers/models/llava/__init__.py b/src/transformers/models/llava/__init__.py
new file mode 100644
index 00000000000000..11aedf9476cfaa
--- /dev/null
+++ b/src/transformers/models/llava/__init__.py
@@ -0,0 +1,56 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {"configuration_llava": ["LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "LlavaConfig"]}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_llava"] = [
+        "LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "LlavaForConditionalGeneration",
+        "LlavaPreTrainedModel",
+    ]
+    _import_structure["processing_llava"] = ["LlavaProcessor"]
+
+
+if TYPE_CHECKING:
+    from .configuration_llava import LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, LlavaConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_llava import (
+            LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
+            LlavaForConditionalGeneration,
+            LlavaPreTrainedModel,
+        )
+        from .processing_llava import LlavaProcessor
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py
new file mode 100644
index 00000000000000..1f174bc1b4237e
--- /dev/null
+++ b/src/transformers/models/llava/configuration_llava.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Llava model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+LLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "llava-hf/llava-v1.5-7b": "https://huggingface.co/llava-hf/llava-v1.5-7b/resolve/main/config.json",
+}
+
+
+class LlavaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an
+    Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Llava-9B.
+
+    e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`LlavaVisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the CLIP backbone.
+        vision_feature_layer (`int`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Llava model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~LlavaForConditionalGeneration`]
+
+    Example:
+
+    ```python
+    >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig
+
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a Llava llava-1.5-7b style configuration
+    >>> configuration = LlavaConfig(vision_config, text_config)
+
+    >>> # Initializing a model from the llava-1.5-7b style configuration
+    >>> model = LlavaForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "llava"
+    is_composition = False
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-2,
+        vocab_size=32000,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        self.vocab_size = vocab_size
+
+        self.vision_config = vision_config
+
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
+            )
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+        self.vocab_size = self.vocab_size
+
+        self.text_config = text_config
+
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+            self.vocab_size = self.text_config.vocab_size
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["llama"]()
+
+        super().__init__(**kwargs)
diff --git a/src/transformers/models/llava/convert_llava_weights_to_hf.py b/src/transformers/models/llava/convert_llava_weights_to_hf.py
new file mode 100644
index 00000000000000..65b58236db1053
--- /dev/null
+++ b/src/transformers/models/llava/convert_llava_weights_to_hf.py
@@ -0,0 +1,126 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import torch
+from huggingface_hub import hf_hub_download
+
+from transformers import (
+    AddedToken,
+    AutoConfig,
+    AutoTokenizer,
+    CLIPImageProcessor,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
+    LlavaProcessor,
+)
+
+
+KEYS_TO_MODIFY_MAPPING = {
+    "model.vision_tower.": "",
+    "model.mm_projector": "multi_modal_projector",
+    "model": "model.model",
+    "vision_model.model": "vision_model",
+    "lm_head": "language_model.lm_head",
+    "model.model": "language_model.model",
+    "multi_modal_projector.0": "multi_modal_projector.linear_1",
+    "multi_modal_projector.2": "multi_modal_projector.linear_2",
+}
+
+
+def convert_state_dict_to_hf(state_dict):
+    new_state_dict = {}
+    for key, value in state_dict.items():
+        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+
+        new_state_dict[key] = value
+    return new_state_dict
+
+
+def convert_llava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id):
+    torch.set_default_dtype(torch.float16)
+    text_config = AutoConfig.from_pretrained(text_model_id)
+
+    tokenizer = AutoTokenizer.from_pretrained(text_model_id)
+    tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False), special=True)
+    tokenizer.add_special_tokens({"pad_token": "<pad>"})
+
+    image_processor = CLIPImageProcessor.from_pretrained(vision_model_id)
+
+    processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+    config = LlavaConfig(text_config=text_config)
+    config.pad_token_id = 32001
+
+    with torch.device("meta"):
+        model = LlavaForConditionalGeneration(config)
+
+    # Pad to 64 for performance reasons
+    pad_shape = 64
+
+    state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict.bin")
+
+    state_dict = torch.load(state_dict_path, map_location="cpu")
+    state_dict = convert_state_dict_to_hf(state_dict)
+    model.load_state_dict(state_dict, strict=True, assign=True)
+
+    pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data
+    mu = torch.mean(pre_expansion_embeddings, dim=0).float()
+    n = pre_expansion_embeddings.size()[0]
+    sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
+    dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)
+
+    # We add an image token so we resize the model
+    model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape)
+    model.language_model.model.embed_tokens.weight.data[32000:] = torch.stack(
+        tuple((dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[32000:].shape[0]))),
+        dim=0,
+    )
+    model.language_model.lm_head.weight.data[32000:] = torch.stack(
+        tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[32000:].shape[0]))),
+        dim=0,
+    )
+    model.config.vocab_size = model.config.vocab_size + pad_shape
+    model.config.text_config.vocab_size = model.config.text_config.vocab_size + pad_shape
+
+    model.push_to_hub(output_hub_path)
+    processor.push_to_hub(output_hub_path)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--text_model_id",
+        help="Hub location of the text model",
+    )
+    parser.add_argument(
+        "--vision_model_id",
+        help="Hub location of the vision model",
+    )
+    parser.add_argument(
+        "--output_hub_path",
+        help="Location on the hub of the converted model",
+    )
+    parser.add_argument(
+        "--old_state_dict_id",
+        help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`",
+    )
+    args = parser.parse_args()
+    convert_llava_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py
new file mode 100644
index 00000000000000..3a7dbc198e3732
--- /dev/null
+++ b/src/transformers/models/llava/modeling_llava.py
@@ -0,0 +1,537 @@
+# coding=utf-8
+# Copyright 2023 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Llava model."""
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ... import PreTrainedModel
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...modeling_outputs import ModelOutput
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ..auto import AutoModel, AutoModelForCausalLM
+from .configuration_llava import LlavaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "LlavaConfig"
+
+LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-1.5-13b-hf",
+    "llava-hf/bakLlava-v1-hf",
+    # See all Llava models at https://huggingface.co/models?filter=llava
+]
+
+
+@dataclass
+# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Llava
+class LlavaCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Llava causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+            sequence_length, hidden_size)`.
+
+            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class LlavaMultiModalProjector(nn.Module):
+    def __init__(self, config: LlavaConfig):
+        super().__init__()
+
+        self.linear_1 = nn.Linear(config.vision_config.hidden_size, config.text_config.hidden_size, bias=True)
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+LLAVA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`LlavaConfig`] or [`LlavaVisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
+    LLAVA_START_DOCSTRING,
+)
+class LlavaPreTrainedModel(PreTrainedModel):
+    config_class = LlavaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlavaVisionAttention"]
+    _supports_flash_attn_2 = True
+
+    def _init_weights(self, module):
+        # important: this ported version of Llava isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+        # https://github.com/haotian-liu/LLaVA/tree/main/llava should serve for that purpose
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+LLAVA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
+            [`CLIPImageProcessor`] for processing images).
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    """The LLAVA model which consists of a vision backbone and a language model.""",
+    LLAVA_START_DOCSTRING,
+)
+class LlavaForConditionalGeneration(LlavaPreTrainedModel):
+    def __init__(self, config: LlavaConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.vocab_size = config.vocab_size
+        self.language_model = AutoModelForCausalLM.from_config(
+            config.text_config, attn_implementation=config._attn_implementation
+        )
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def set_decoder(self, decoder):
+        self.language_model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        # update vocab size
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        self.config.vocab_size = model_embeds.num_embeddings
+        self.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+
+    def _merge_input_ids_with_image_features(
+        self, image_features, inputs_embeds, input_ids, attention_mask, position_ids
+    ):
+        num_images, num_image_patches, embed_dim = image_features.shape
+        batch_size, sequence_length = input_ids.shape
+        left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
+        # 1. Create a mask to know where special image tokens are
+        special_image_token_mask = input_ids == self.config.image_token_index
+        num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
+        # Compute the maximum embed dimension
+        max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+        batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
+
+        # 2. Compute the positions where text should be written
+        # Calculate new positions for text tokens in merged image-text sequence.
+        # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
+        # `torch.cumsum` computes how each image token shifts subsequent text token positions.
+        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
+        new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+        if left_padding:
+            new_token_positions += nb_image_pad[:, None]  # offset for left padding
+        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+
+        # 3. Create the full embedding, already padded to the maximum position
+        final_embedding = torch.zeros(
+            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+        final_attention_mask = torch.zeros(
+            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+        )
+        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+        # set the corresponding tensors into their correct target device.
+        target_device = inputs_embeds.device
+        batch_indices, non_image_indices, text_to_overwrite = (
+            batch_indices.to(target_device),
+            non_image_indices.to(target_device),
+            text_to_overwrite.to(target_device),
+        )
+        attention_mask = attention_mask.to(target_device)
+
+        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+
+        # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
+        image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
+        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+
+        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
+            raise ValueError(
+                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
+                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
+            )
+
+        final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        final_attention_mask |= image_to_overwrite
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+        return final_embedding, final_attention_mask, position_ids
+
+    @add_start_docstrings_to_model_forward(LLAVA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=LlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[int] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, LlavaCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+
+        >>> prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=text, images=image, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "There seems to be a stop sign"
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        if inputs_embeds is None:
+            # 1. Extra the input embeddings
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            # 2. Merge text and images
+            if pixel_values is not None and input_ids.shape[1] != 1:
+                image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+                # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated.
+                selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+
+                if vision_feature_select_strategy == "default":
+                    selected_image_feature = selected_image_feature[:, 1:]
+                elif vision_feature_select_strategy == "full":
+                    selected_image_feature = selected_image_feature
+                else:
+                    raise ValueError(
+                        f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}"
+                    )
+
+                image_features = self.multi_modal_projector(selected_image_feature)
+                inputs_embeds, attention_mask, position_ids = self._merge_input_ids_with_image_features(
+                    image_features, inputs_embeds, input_ids, attention_mask, position_ids
+                )
+                if labels is None:
+                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
+            else:
+                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+                # generation with cache
+                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                    # Retrieve the first layer to inspect the logits and mask out the hidden states
+                    # that are set to 0
+                    first_layer_past_key_value = past_key_values[0][0][:, 0, :, 0]
+                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value == 0)
+                    # Get the target length
+                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
+
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[batch_index, non_attended_tokens] = 0
+
+                    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                    position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        logits = outputs[0]
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return LlavaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            elif self.config.image_token_index in input_ids:
+                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
+            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
+            # older attention values, as their corresponding values are not part of the input.
+            if cache_length < past_length and attention_mask is not None:
+                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+            }
+        )
+        return model_inputs
+
+    def _reorder_cache(self, *args, **kwargs):
+        return self.language_model._reorder_cache(*args, **kwargs)
diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py
new file mode 100644
index 00000000000000..1ba1b30e65906b
--- /dev/null
+++ b/src/transformers/models/llava/processing_llava.py
@@ -0,0 +1,136 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Llava.
+"""
+
+
+from typing import List, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class LlavaProcessor(ProcessorMixin):
+    r"""
+    Constructs a Llava processor which wraps a Llava image processor and a Llava tokenizer into a single processor.
+
+    [`LlavaProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~LlavaProcessor.__call__`] and [`~LlavaProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`CLIPImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "CLIPImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None):
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        images: ImageInput = None,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length=None,
+        return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is not None:
+            pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
+        else:
+            pixel_values = None
+        text_inputs = self.tokenizer(
+            text, return_tensors=return_tensors, padding=padding, truncation=truncation, max_length=max_length
+        )
+
+        return BatchFeature(data={**text_inputs, "pixel_values": pixel_values})
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Llama
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
diff --git a/src/transformers/models/longformer/modeling_longformer.py b/src/transformers/models/longformer/modeling_longformer.py
index 62b7ac4a1dc4fb..40587cebc17697 100755
--- a/src/transformers/models/longformer/modeling_longformer.py
+++ b/src/transformers/models/longformer/modeling_longformer.py
@@ -1598,7 +1598,7 @@ def _pad_to_window_size(
 
         # this path should be recorded in the ONNX export, it is fine with padding_len == 0 as well
         if padding_len > 0:
-            logger.info(
+            logger.warning_once(
                 f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
                 f"`config.attention_window`: {attention_window}"
             )
@@ -1917,7 +1917,7 @@ def forward(
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
         if global_attention_mask is None:
-            logger.info("Initializing global attention on CLS token...")
+            logger.warning_once("Initializing global attention on CLS token...")
             global_attention_mask = torch.zeros_like(input_ids)
             # global attention on cls token
             global_attention_mask[:, 0] = 1
@@ -2270,7 +2270,7 @@ def forward(
 
         # set global attention on question tokens
         if global_attention_mask is None and input_ids is not None:
-            logger.info("Initializing global attention on multiple choice...")
+            logger.warning_once("Initializing global attention on multiple choice...")
             # put global attention on all tokens after `config.sep_token_id`
             global_attention_mask = torch.stack(
                 [
diff --git a/src/transformers/models/longformer/modeling_tf_longformer.py b/src/transformers/models/longformer/modeling_tf_longformer.py
index 0397c2ba320ec5..029983e27f0e0b 100644
--- a/src/transformers/models/longformer/modeling_tf_longformer.py
+++ b/src/transformers/models/longformer/modeling_tf_longformer.py
@@ -2213,7 +2213,7 @@ def call(
                 )
                 global_attention_mask = tf.cast(tf.fill(shape_list(input_ids), value=0), tf.int64)
             else:
-                logger.info("Initializing global attention on question tokens...")
+                logger.warning_once("Initializing global attention on question tokens...")
                 # put global attention on all tokens until `config.sep_token_id` is reached
                 sep_token_indices = tf.where(input_ids == self.config.sep_token_id)
                 sep_token_indices = tf.cast(sep_token_indices, dtype=tf.int64)
@@ -2341,7 +2341,7 @@ def call(
             global_attention_mask = tf.cast(global_attention_mask, tf.int64)
 
         if global_attention_mask is None and input_ids is not None:
-            logger.info("Initializing global attention on CLS token...")
+            logger.warning_once("Initializing global attention on CLS token...")
             # global attention on cls token
             global_attention_mask = tf.zeros_like(input_ids)
             updates = tf.ones(shape_list(input_ids)[0], dtype=tf.int64)
diff --git a/src/transformers/models/m2m_100/modeling_m2m_100.py b/src/transformers/models/m2m_100/modeling_m2m_100.py
index c05948540f7865..656c526536c563 100755
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
@@ -325,9 +325,8 @@ class M2M100EncoderLayer(nn.Module):
     def __init__(self, config: M2M100Config):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = M2M100_ATTENTION_CLASSES[attn_type](
+        self.self_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -392,7 +391,7 @@ def forward(
         return outputs
 
 
-M2M100_ATTENTION_CLASSES = {"default": M2M100Attention}
+M2M100_ATTENTION_CLASSES = {"eager": M2M100Attention}
 
 
 # Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->M2M100, MBART->M2M100
@@ -400,9 +399,8 @@ class M2M100DecoderLayer(nn.Module):
     def __init__(self, config: M2M100Config):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = M2M100_ATTENTION_CLASSES[attn_type](
+        self.self_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -415,7 +413,7 @@ def __init__(self, config: M2M100Config):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = M2M100_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = M2M100_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py
index cabf0c68f8b62b..d52a060d4723c8 100755
--- a/src/transformers/models/marian/modeling_marian.py
+++ b/src/transformers/models/marian/modeling_marian.py
@@ -272,9 +272,8 @@ class MarianEncoderLayer(nn.Module):
     def __init__(self, config: MarianConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = MARIAN_ATTENTION_CLASSES[attn_type](
+        self.self_attn = MARIAN_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -339,7 +338,7 @@ def forward(
         return outputs
 
 
-MARIAN_ATTENTION_CLASSES = {"default": MarianAttention}
+MARIAN_ATTENTION_CLASSES = {"eager": MarianAttention}
 
 
 # Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->Marian, BART->MARIAN
@@ -348,8 +347,7 @@ def __init__(self, config: MarianConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
-        self.self_attn = MARIAN_ATTENTION_CLASSES[attn_type](
+        self.self_attn = MARIAN_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -362,7 +360,7 @@ def __init__(self, config: MarianConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = MARIAN_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = MARIAN_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
diff --git a/src/transformers/models/mask2former/modeling_mask2former.py b/src/transformers/models/mask2former/modeling_mask2former.py
index 08d43bd2960470..5a5dacbcf4c120 100644
--- a/src/transformers/models/mask2former/modeling_mask2former.py
+++ b/src/transformers/models/mask2former/modeling_mask2former.py
@@ -471,6 +471,9 @@ def forward(
             cost_dice = pair_wise_dice_loss(pred_mask, target_mask)
             # final cost matrix
             cost_matrix = self.cost_mask * cost_mask + self.cost_class * cost_class + self.cost_dice * cost_dice
+            # eliminate infinite values in cost_matrix to avoid the error ``ValueError: cost matrix is infeasible``
+            cost_matrix = torch.minimum(cost_matrix, torch.tensor(1e10))
+            cost_matrix = torch.maximum(cost_matrix, torch.tensor(-1e10))
             # do the assigmented using the hungarian algorithm in scipy
             assigned_indices: Tuple[np.array] = linear_sum_assignment(cost_matrix.cpu())
             indices.append(assigned_indices)
diff --git a/src/transformers/models/mbart/modeling_mbart.py b/src/transformers/models/mbart/modeling_mbart.py
index 97fdf9ed87998b..3d25d75b3ef28f 100755
--- a/src/transformers/models/mbart/modeling_mbart.py
+++ b/src/transformers/models/mbart/modeling_mbart.py
@@ -41,6 +41,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
@@ -289,6 +290,15 @@ class MBartFlashAttention2(MBartAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
 
@@ -413,6 +423,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -433,13 +449,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -485,7 +501,7 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
 
 
 MBART_ATTENTION_CLASSES = {
-    "default": MBartAttention,
+    "eager": MBartAttention,
     "flash_attention_2": MBartFlashAttention2,
 }
 
@@ -494,9 +510,8 @@ class MBartEncoderLayer(nn.Module):
     def __init__(self, config: MBartConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = MBART_ATTENTION_CLASSES[attn_type](
+        self.self_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -565,9 +580,8 @@ class MBartDecoderLayer(nn.Module):
     def __init__(self, config: MBartConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = MBART_ATTENTION_CLASSES[attn_type](
+        self.self_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -580,7 +594,7 @@ def __init__(self, config: MBartConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = MBART_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = MBART_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -919,6 +933,7 @@ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = N
             embed_dim,
         )
         self.layers = nn.ModuleList([MBartEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.layernorm_embedding = nn.LayerNorm(embed_dim)
         self.layer_norm = nn.LayerNorm(config.d_model)
 
@@ -1007,7 +1022,7 @@ def forward(
         # expand attention_mask
         if attention_mask is not None:
             # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
             else:
                 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
@@ -1096,6 +1111,7 @@ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = N
             config.d_model,
         )
         self.layers = nn.ModuleList([MBartDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
         self.layer_norm = nn.LayerNorm(config.d_model)
 
@@ -1215,7 +1231,7 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         else:
@@ -1226,7 +1242,7 @@ def forward(
 
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
             else:
                 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
diff --git a/src/transformers/models/mistral/modeling_mistral.py b/src/transformers/models/mistral/modeling_mistral.py
index 3aefb03d8c6dad..29af7c0e88e979 100644
--- a/src/transformers/models/mistral/modeling_mistral.py
+++ b/src/transformers/models/mistral/modeling_mistral.py
@@ -30,6 +30,7 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
 from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_utils import PreTrainedModel
@@ -37,6 +38,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
@@ -106,7 +108,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -194,9 +196,17 @@ class MistralAttention(nn.Module):
     and "Generating Long Sequences with Sparse Transformers".
     """
 
-    def __init__(self, config: MistralConfig):
+    def __init__(self, config: MistralConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
@@ -231,7 +241,7 @@ def forward(
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
         **kwargs,
@@ -252,16 +262,19 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
 
         if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         # repeat k/v heads if n_kv_heads < n_heads
         key_states = repeat_kv(key_states, self.num_key_value_groups)
@@ -312,12 +325,21 @@ class MistralFlashAttention2(MistralAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
         **kwargs,
@@ -341,7 +363,7 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
 
         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
@@ -351,7 +373,7 @@ def forward(
 
         use_sliding_windows = (
             _flash_supports_window_size
-            and hasattr(self.config, "sliding_window") is not None
+            and getattr(self.config, "sliding_window", None) is not None
             and kv_seq_len > self.config.sliding_window
         )
 
@@ -363,8 +385,8 @@ def forward(
 
         if past_key_value is not None:
             # Activate slicing cache only if the config has a value `sliding_windows` attribute
-            if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window:
-                slicing_tokens = kv_seq_len - self.config.sliding_window
+            if getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window:
+                slicing_tokens = 1 - self.config.sliding_window
 
                 past_key = past_key_value[0]
                 past_value = past_key_value[1]
@@ -384,10 +406,8 @@ def forward(
                     attention_mask = attention_mask[:, slicing_tokens:]
                     attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
 
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         # repeat k/v heads if n_kv_heads < n_heads
         key_states = repeat_kv(key_states, self.num_key_value_groups)
@@ -470,6 +490,12 @@ def _flash_attention_forward(
             use_sliding_windows (`bool`, *optional*):
                 Whether to activate sliding window attention.
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -491,7 +517,7 @@ def _flash_attention_forward(
                     max_seqlen_k=max_seqlen_in_batch_k,
                     dropout_p=dropout,
                     softmax_scale=softmax_scale,
-                    causal=self.is_causal,
+                    causal=causal,
                 )
             else:
                 attn_output_unpad = flash_attn_varlen_func(
@@ -504,7 +530,7 @@ def _flash_attention_forward(
                     max_seqlen_k=max_seqlen_in_batch_k,
                     dropout_p=dropout,
                     softmax_scale=softmax_scale,
-                    causal=self.is_causal,
+                    causal=causal,
                     window_size=(self.config.sliding_window, self.config.sliding_window),
                 )
 
@@ -517,7 +543,7 @@ def _flash_attention_forward(
                     value_states,
                     dropout,
                     softmax_scale=softmax_scale,
-                    causal=self.is_causal,
+                    causal=causal,
                 )
             else:
                 attn_output = flash_attn_func(
@@ -526,7 +552,7 @@ def _flash_attention_forward(
                     value_states,
                     dropout,
                     softmax_scale=softmax_scale,
-                    causal=self.is_causal,
+                    causal=causal,
                     window_size=(self.config.sliding_window, self.config.sliding_window),
                 )
 
@@ -575,15 +601,19 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         )
 
 
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MistralAttention,
+    "flash_attention_2": MistralFlashAttention2,
+}
+
+
 class MistralDecoderLayer(nn.Module):
-    def __init__(self, config: MistralConfig):
+    def __init__(self, config: MistralConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = (
-            MistralAttention(config=config)
-            if not getattr(config, "_flash_attn_2_enabled", False)
-            else MistralFlashAttention2(config)
-        )
+
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
         self.mlp = MistralMLP(config)
         self.input_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
@@ -676,6 +706,7 @@ class MistralPreTrainedModel(PreTrainedModel):
     _no_split_modules = ["MistralDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
     _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
@@ -724,17 +755,23 @@ def _init_weights(self, module):
             config.n_positions - 1]`.
 
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -771,7 +808,10 @@ def __init__(self, config: MistralConfig):
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList([MistralDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList(
+            [MistralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
         self.norm = MistralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
         self.gradient_checkpointing = False
@@ -815,12 +855,13 @@ def forward(
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
-        seq_length_with_past = seq_length
         past_key_values_length = 0
 
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -834,12 +875,7 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if (
-            attention_mask is not None
-            and hasattr(self.config, "_flash_attn_2_enabled")
-            and self.config._flash_attn_2_enabled
-            and past_key_values is not None
-        ):
+        if attention_mask is not None and self._use_flash_attention_2 and use_cache:
             is_padding_right = attention_mask[:, -1].sum().item() != batch_size
             if is_padding_right:
                 raise ValueError(
@@ -848,7 +884,7 @@ def forward(
                     " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
                 )
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
         else:
@@ -873,21 +909,19 @@ def forward(
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
+        next_decoder_cache = None
 
-        for idx, decoder_layer in enumerate(self.layers):
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
             if self.gradient_checkpointing and self.training:
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
                     attention_mask,
                     position_ids,
-                    past_key_value,
+                    past_key_values,
                     output_attentions,
                     use_cache,
                 )
@@ -896,7 +930,7 @@ def forward(
                     hidden_states,
                     attention_mask=attention_mask,
                     position_ids=position_ids,
-                    past_key_value=past_key_value,
+                    past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                 )
@@ -904,7 +938,7 @@ def forward(
             hidden_states = layer_outputs[0]
 
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
@@ -915,7 +949,10 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = next_decoder_cache if use_cache else None
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
@@ -1049,17 +1086,34 @@ def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         # Omit tokens covered by past_key_values
-        if past_key_values:
-            past_length = past_key_values[0][0].shape[2]
-
-            # Some generation methods already pass only the last input ID
-            if input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
             else:
-                # Default to old behavior: keep only final ID
-                remove_prefix_length = input_ids.shape[1] - 1
-
-            input_ids = input_ids[:, remove_prefix_length:]
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
diff --git a/src/transformers/models/mixtral/__init__.py b/src/transformers/models/mixtral/__init__.py
new file mode 100644
index 00000000000000..ebde04ea4ae81c
--- /dev/null
+++ b/src/transformers/models/mixtral/__init__.py
@@ -0,0 +1,62 @@
+# Copyright 2023 Mixtral AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_mixtral": ["MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP", "MixtralConfig"],
+}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_mixtral"] = [
+        "MixtralForCausalLM",
+        "MixtralModel",
+        "MixtralPreTrainedModel",
+        "MixtralForSequenceClassification",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_mixtral import MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP, MixtralConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_mixtral import (
+            MixtralForCausalLM,
+            MixtralForSequenceClassification,
+            MixtralModel,
+            MixtralPreTrainedModel,
+        )
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/mixtral/configuration_mixtral.py b/src/transformers/models/mixtral/configuration_mixtral.py
new file mode 100644
index 00000000000000..dc547068e02f42
--- /dev/null
+++ b/src/transformers/models/mixtral/configuration_mixtral.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2023 Mixtral AI and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Mixtral model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+MIXTRAL_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "mistral-ai/Mixtral-8x7B": "https://huggingface.co/mistral-ai/Mixtral-8x7B/resolve/main/config.json",
+}
+
+
+class MixtralConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MixtralModel`]. It is used to instantiate an
+    Mixtral model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Mixtral-7B-v0.1 or Mixtral-7B-Instruct-v0.1.
+
+    [mixtralai/Mixtral-8x7B](https://huggingface.co/mixtralai/Mixtral-8x7B)
+    [mixtralai/Mixtral-7B-Instruct-v0.1](https://huggingface.co/mixtralai/Mixtral-7B-Instruct-v0.1)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Mixtral model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MixtralModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 14336):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 8):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to `4096*32`):
+            The maximum sequence length that this model might ever be used with. Mixtral's sliding window attention
+            allows sequence of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied.
+        rope_theta (`float`, *optional*, defaults to 1000000.0):
+            The base period of the RoPE embeddings.
+        sliding_window (`int`, *optional*, defaults to 4096):
+            Sliding window attention window size. If not specified, will default to `4096`.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts_per_tok (`int`, *optional*, defaults to 2):
+            The number of experts to root per-token, can be also interpreted as the `top-p` routing
+            parameter
+        num_local_experts (`int`, *optional*, defaults to 8):
+            Number of experts per Sparse MLP layer.
+        output_router_logits (`bool`, *optional*, defaults to `False`):
+            Whether or not the router logits should be returned by the model. Enabeling this will also
+            allow the model to output the auxiliary loss. See [here]() for more details
+        router_aux_loss_coef (`float`, *optional*, defaults to 0.001):
+            The aux loss factor for the total loss.
+
+    ```python
+    >>> from transformers import MixtralModel, MixtralConfig
+
+    >>> # Initializing a Mixtral 7B style configuration
+    >>> configuration = MixtralConfig()
+
+    >>> # Initializing a model from the Mixtral 7B style configuration
+    >>> model = MixtralModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mixtral"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=14336,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=8,
+        hidden_act="silu",
+        max_position_embeddings=4096 * 32,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=1e6,
+        sliding_window=4096,
+        attention_dropout=0.0,
+        num_experts_per_tok=2,
+        num_local_experts=8,
+        output_router_logits=False,
+        router_aux_loss_coef=0.001,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_dropout = attention_dropout
+
+        self.num_experts_per_tok = num_experts_per_tok
+        self.num_local_experts = num_local_experts
+        self.output_router_logits = output_router_logits
+        self.router_aux_loss_coef = router_aux_loss_coef
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py b/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py
new file mode 100644
index 00000000000000..53cb8014438165
--- /dev/null
+++ b/src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py
@@ -0,0 +1,244 @@
+# Copyright 2023 Mistral AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import os
+
+import torch
+
+from transformers import (
+    MixtralConfig,
+    MixtralForCausalLM,
+)
+
+
+"""
+Sample usage:
+
+```
+python src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py \
+    --input_dir /path/to/downloaded/mixtral/weights --model_size 7B --output_dir /output/path
+```
+
+Thereafter, models can be loaded via:
+
+```py
+from transformers import MixtralForCausalLM
+
+model = MixtralForCausalLM.from_pretrained("/output/path")
+```
+
+Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
+come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
+"""
+
+
+def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256):
+    return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of)
+
+
+def read_json(path):
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def write_json(text, path):
+    with open(path, "w") as f:
+        json.dump(text, f)
+
+
+def write_model(model_path, input_base_path, model_size, safe_serialization=True):
+    os.makedirs(model_path, exist_ok=True)
+
+    params = read_json(os.path.join(input_base_path, "params.json"))
+    num_shards = 1
+
+    # For some reason this is a string in the params.json
+    sliding_window = int(params["sliding_window"])
+    n_layers = params["num_hidden_layers"]
+    n_heads = params["num_attention_heads"]
+    n_heads_per_shard = n_heads // num_shards
+    dim = params["hidden_size"]
+    dims_per_head = dim // n_heads
+    base = params.get("rope_theta", 10000.0)
+    max_position_embeddings = 4096 * 8
+    num_local_experts = params["num_local_experts"]
+    ffn_dim = params["intermediate_size"]
+
+    vocab_size = params["vocab_size"]
+
+    if "num_key_value_heads" in params:
+        num_key_value_heads = params["num_key_value_heads"]  # for GQA / MQA
+        num_local_key_value_heads = num_key_value_heads // num_shards
+        key_value_dim = dims_per_head * num_local_key_value_heads
+    else:  # compatibility with other checkpoints
+        num_key_value_heads = n_heads
+        num_local_key_value_heads = n_heads_per_shard
+        key_value_dim = dim
+
+    # permute for sliced rotary
+    def permute(w, n_heads=n_heads, dim1=dim, dim2=dim):
+        return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2)
+
+    print(f"Fetching all parameters from the checkpoint at {input_base_path}.")
+    # Load weights
+    loaded = [
+        torch.load(os.path.join(input_base_path, f"consolidated.{i:02d}.pt"), map_location="cpu") for i in range(8)
+    ]
+
+    merged_state_dict = {}
+    for state_dict in loaded:
+        merged_state_dict.update(state_dict)
+
+    state_dict = {}
+
+    for layer_i in range(n_layers):
+        # Sharded
+        # Note that attention.w{q,k,v,o}, feed_fordward.w[1,2,3], attention_norm.weight and ffn_norm.weight share
+        # the same storage object, saving attention_norm and ffn_norm will save other weights too, which is
+        # redundant as other weights will be stitched from multiple shards. To avoid that, they are cloned.
+
+        state_dict.update(
+            {
+                f"model.layers.{layer_i}.input_layernorm.weight": merged_state_dict[
+                    f"layers.{layer_i}.attention_norm.weight"
+                ].clone(),
+                f"model.layers.{layer_i}.post_attention_layernorm.weight": merged_state_dict[
+                    f"layers.{layer_i}.ffn_norm.weight"
+                ].clone(),
+            }
+        )
+
+        state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute(
+            merged_state_dict[f"layers.{layer_i}.attention.wq.weight"]
+            .view(n_heads_per_shard, dims_per_head, dim)
+            .reshape(dim, dim)
+        )
+        state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute(
+            merged_state_dict[f"layers.{layer_i}.attention.wk.weight"]
+            .view(num_local_key_value_heads, dims_per_head, dim)
+            .reshape(key_value_dim, dim),
+            num_key_value_heads,
+            key_value_dim,
+            dim,
+        )
+        state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = (
+            merged_state_dict[f"layers.{layer_i}.attention.wv.weight"]
+            .view(num_local_key_value_heads, dims_per_head, dim)
+            .reshape(key_value_dim, dim)
+        )
+
+        state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = merged_state_dict[
+            f"layers.{layer_i}.attention.wo.weight"
+        ]
+
+        w1 = merged_state_dict[f"layers.{layer_i}.block_sparse_moe.w1"]
+        w2 = merged_state_dict[f"layers.{layer_i}.block_sparse_moe.w2"]
+        w3 = merged_state_dict[f"layers.{layer_i}.block_sparse_moe.w3"]
+
+        experts_w1 = [
+            w1[ffn_dim * expert_idx : ffn_dim * (expert_idx + 1), :].contiguous().clone()
+            for expert_idx in range(num_local_experts)
+        ]
+
+        for idx, expert_block in enumerate(experts_w1):
+            expert_key = f"model.layers.{layer_i}.block_sparse_moe.experts.{idx}.w1"
+            state_dict[expert_key + ".weight"] = expert_block.clone()
+
+        experts_w2 = [
+            w2[ffn_dim * expert_idx : ffn_dim * (expert_idx + 1), :].contiguous().clone()
+            for expert_idx in range(num_local_experts)
+        ]
+
+        for idx, expert_block in enumerate(experts_w2):
+            expert_key = f"model.layers.{layer_i}.block_sparse_moe.experts.{idx}.w2"
+            state_dict[expert_key + ".weight"] = expert_block.T.clone().contiguous()
+
+        experts_w3 = [
+            w3[ffn_dim * expert_idx : ffn_dim * (expert_idx + 1), :].contiguous().clone()
+            for expert_idx in range(num_local_experts)
+        ]
+
+        for idx, expert_block in enumerate(experts_w3):
+            expert_key = f"model.layers.{layer_i}.block_sparse_moe.experts.{idx}.w3"
+            state_dict[expert_key + ".weight"] = expert_block.clone()
+
+        state_dict[f"model.layers.{layer_i}.block_sparse_moe.gate.weight"] = merged_state_dict[
+            f"layers.{layer_i}.block_sparse_moe.gate.weight"
+        ]
+
+    state_dict.update(
+        {
+            "model.norm.weight": merged_state_dict["norm.weight"],
+            "model.embed_tokens.weight": merged_state_dict["tok_embeddings.weight"],
+            "lm_head.weight": merged_state_dict["output.weight"],
+        }
+    )
+
+    config = MixtralConfig(
+        hidden_size=dim,
+        intermediate_size=ffn_dim,
+        num_attention_heads=params["num_attention_heads"],
+        num_hidden_layers=params["num_hidden_layers"],
+        rms_norm_eps=params["rms_norm_eps"],
+        num_key_value_heads=num_key_value_heads,
+        vocab_size=vocab_size,
+        rope_theta=base,
+        max_position_embeddings=max_position_embeddings,
+        sliding_window=sliding_window,
+        num_local_experts=num_local_experts,
+    )
+
+    print("Loading the checkpoint in a Mixtral model.")
+    with torch.device("meta"):
+        model = MixtralForCausalLM(config)
+    # Avoid saving this as part of the config.
+    del model.config._name_or_path
+    model.config.torch_dtype = torch.float16
+    print("Saving in the Transformers format.")
+
+    model.load_state_dict(state_dict, strict=True, assign=True)
+
+    for n, p in model.named_parameters():
+        assert p.device.type != "meta", f"{n} has not been loaded!"
+
+    model.save_pretrained(model_path, safe_serialization=safe_serialization)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_dir",
+        help="Location of Mixtral weights, which contains tokenizer.model and model folders",
+        required=True,
+    )
+    parser.add_argument(
+        "--model_size",
+        choices=["7B"],
+        help="'f' models correspond to the finetuned versions, and are specific to the Mixtral official release. For more details on Mixtral, checkout the original repo: https://huggingface.co/mistral-ai",
+        default="7B",
+    )
+    parser.add_argument("--output_dir", help="Location to write HF model", required=True)
+    parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
+    args = parser.parse_args()
+    write_model(
+        model_path=args.output_dir,
+        input_base_path=args.input_dir,
+        model_size=args.model_size,
+        safe_serialization=args.safe_serialization,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py
new file mode 100644
index 00000000000000..7268673441fe87
--- /dev/null
+++ b/src/transformers/models/mixtral/modeling_mixtral.py
@@ -0,0 +1,1454 @@
+# coding=utf-8
+# Copyright 2023 Mistral AI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch Mixtral model."""
+import inspect
+import math
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+import torch.utils.checkpoint
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_causal_attention_mask,
+)
+from ...modeling_outputs import (
+    MoeCausalLMOutputWithPast,
+    MoeModelOutputWithPast,
+    SequenceClassifierOutputWithPast,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import is_torch_greater_or_equal_than_1_13
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
+    logging,
+    replace_return_docstrings,
+)
+from ...utils.import_utils import is_torch_fx_available
+from .configuration_mixtral import MixtralConfig
+
+
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+    _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
+
+# This makes `_prepare_4d_causal_attention_mask` a leaf function in the FX graph.
+# It means that the function will not be traced through and simply appear as a node in the graph.
+if is_torch_fx_available():
+    if not is_torch_greater_or_equal_than_1_13:
+        import torch.fx
+
+    _prepare_4d_causal_attention_mask = torch.fx.wrap(_prepare_4d_causal_attention_mask)
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MixtralConfig"
+
+
+def load_balancing_loss_func(gate_logits: torch.Tensor, num_experts: torch.Tensor = None, top_k=2) -> float:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://arxiv.org/abs/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        gate_logits (Union[`torch.Tensor`, Tuple[torch.Tensor]):
+            Logits from the `gate`, should be a tuple of tensors. Shape: [batch_size, seqeunce_length, num_experts].
+        num_experts (`int`, *optional*):
+            Number of experts
+
+    Returns:
+        The auxiliary loss.
+    """
+    if gate_logits is None:
+        return 0
+
+    if isinstance(gate_logits, tuple):
+        # cat along the layers?
+        compute_device = gate_logits[0].device
+        gate_logits = torch.cat([gate.to(compute_device) for gate in gate_logits], dim=0)
+
+    routing_weights, selected_experts = torch.topk(gate_logits, top_k, dim=-1)
+    routing_weights = routing_weights.softmax(dim=-1)
+
+    # cast the expert indices to int64, otherwise one-hot encoding will fail
+    if selected_experts.dtype != torch.int64:
+        selected_experts = selected_experts.to(torch.int64)
+
+    if len(selected_experts.shape) == 2:
+        selected_experts = selected_experts.unsqueeze(2)
+
+    expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)
+
+    # For a given token, determine if it was routed to a given expert.
+    expert_mask = torch.max(expert_mask, axis=-2).values
+
+    # cast to float32 otherwise mean will fail
+    expert_mask = expert_mask.to(torch.float32)
+    tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)
+
+    router_prob_per_group_and_expert = torch.mean(routing_weights, axis=-1)
+    return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert.unsqueeze(-1)) * (num_experts**2)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Mixtral
+class MixtralRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        MixtralRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Mixtral
+class MixtralRotaryEmbedding(nn.Module):
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
+
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralAttention with Mistral->Mixtral
+class MixtralAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+    """
+
+    def __init__(self, config: MixtralConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+        self.rotary_emb = MixtralRotaryEmbedding(
+            self.head_dim,
+            max_position_embeddings=self.max_position_embeddings,
+            base=self.rope_theta,
+        )
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                )
+
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with Mistral->Mixtral
+class MixtralFlashAttention2(MixtralAttention):
+    """
+    Mixtral flash attention module. This module inherits from `MixtralAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ):
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+            # overwrite attention_mask with padding_mask
+            attention_mask = kwargs.pop("padding_mask")
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # Because the input can be padded, the absolute sequence length depends on the max position id.
+        rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
+        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+
+        use_sliding_windows = (
+            _flash_supports_window_size
+            and getattr(self.config, "sliding_window", None) is not None
+            and kv_seq_len > self.config.sliding_window
+        )
+
+        if not _flash_supports_window_size:
+            logger.warning_once(
+                "The current flash attention version does not support sliding window attention, for a more memory efficient implementation"
+                " make sure to upgrade flash-attn library."
+            )
+
+        if past_key_value is not None:
+            # Activate slicing cache only if the config has a value `sliding_windows` attribute
+            if getattr(self.config, "sliding_window", None) is not None and kv_seq_len > self.config.sliding_window:
+                slicing_tokens = 1 - self.config.sliding_window
+
+                past_key = past_key_value[0]
+                past_value = past_key_value[1]
+
+                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+                if past_key.shape[-2] != self.config.sliding_window - 1:
+                    raise ValueError(
+                        f"past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1, head_dim`), got"
+                        f" {past_key.shape}"
+                    )
+
+                past_key_value = (past_key, past_value)
+
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, slicing_tokens:]
+                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+
+            cache_kwargs = {"sin": sin, "cos": cos}  # Specific to RoPE models
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reashape to the expected shape for Flash Attention
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            use_sliding_windows=use_sliding_windows,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+        use_sliding_windows=False,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+            use_sliding_windows (`bool`, *optional*):
+                Whether to activate sliding window attention.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            if not use_sliding_windows:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output_unpad = flash_attn_varlen_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    cu_seqlens_q=cu_seqlens_q,
+                    cu_seqlens_k=cu_seqlens_k,
+                    max_seqlen_q=max_seqlen_in_batch_q,
+                    max_seqlen_k=max_seqlen_in_batch_k,
+                    dropout_p=dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            if not use_sliding_windows:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                )
+            else:
+                attn_output = flash_attn_func(
+                    query_states,
+                    key_states,
+                    value_states,
+                    dropout,
+                    softmax_scale=softmax_scale,
+                    causal=causal,
+                    window_size=(self.config.sliding_window, self.config.sliding_window),
+                )
+
+        return attn_output
+
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape
+
+        # On the first iteration we need to properly re-create the padding mask
+        # by slicing it on the proper place
+        if kv_seq_len != attention_mask.shape[-1]:
+            attention_mask_num_tokens = attention_mask.shape[-1]
+            attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :]
+
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+
+        key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+        value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k)
+
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+class MixtralBLockSparseTop2MLP(nn.Module):
+    def __init__(self, config: MixtralConfig):
+        super().__init__()
+        self.ffn_dim = config.intermediate_size
+        self.hidden_dim = config.hidden_size
+
+        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
+        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
+        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)
+
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states):
+        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(hidden_states)
+        current_hidden_states = self.w2(current_hidden_states)
+        return current_hidden_states
+
+
+MISTRAL_ATTENTION_CLASSES = {
+    "eager": MixtralAttention,
+    "flash_attention_2": MixtralFlashAttention2,
+}
+
+
+class MixtralSparseMoeBlock(nn.Module):
+    """
+    This implementation is
+    strictly equivalent to standard MoE with full capacity (no
+    dropped tokens). It's faster since it formulates MoE operations
+    in terms of block-sparse operations to accomodate imbalanced
+    assignments of tokens to experts, whereas standard MoE either
+    (1) drop tokens at the cost of reduced performance or (2) set
+    capacity factor to number of experts and thus waste computation
+    and memory on padding.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+
+        # gating
+        self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
+
+        self.experts = nn.ModuleList([MixtralBLockSparseTop2MLP(config) for _ in range(self.num_experts)])
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """ """
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+        router_logits = self.gate(hidden_states)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx])
+
+            if top_x.shape[0] == 0:
+                continue
+
+            # in torch it is faster to index using lists than torch tensors
+            top_x_list = top_x.tolist()
+            idx_list = idx.tolist()
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None]
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states, router_logits
+
+
+class MixtralDecoderLayer(nn.Module):
+    def __init__(self, config: MixtralConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = MISTRAL_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx)
+
+        self.block_sparse_moe = MixtralSparseMoeBlock(config)
+        self.input_layernorm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        output_router_logits: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs,
+    ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_router_logits (`bool`, *optional*):
+                Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+                should not be returned during inference.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states, router_logits = self.block_sparse_moe(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        if output_router_logits:
+            outputs += (router_logits,)
+
+        return outputs
+
+
+MIXTRAL_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`MixtralConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Mixtral Model outputting raw hidden-states without any specific head on top.",
+    MIXTRAL_START_DOCSTRING,
+)
+# Copied from transformers.models.mistral.modeling_mistral.MistralPreTrainedModel with Mistral->Mixtral
+class MixtralPreTrainedModel(PreTrainedModel):
+    config_class = MixtralConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["MixtralDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+MIXTRAL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        output_router_logits (`bool`, *optional*):
+            Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
+            should not be returned during inference.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Mixtral Model outputting raw hidden-states without any specific head on top.",
+    MIXTRAL_START_DOCSTRING,
+)
+# Copied from transformers.models.mistral.modeling_mistral.MistralModel with MISTRAL->MIXTRAL,Mistral->Mixtral
+class MixtralModel(MixtralPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MixtralDecoderLayer`]
+
+    Args:
+        config: MixtralConfig
+    """
+
+    def __init__(self, config: MixtralConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [MixtralDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self.norm = MixtralRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    # Ignore copy
+    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MoeModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        past_key_values_length = 0
+
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
+        else:
+            position_ids = position_ids.view(-1, seq_length).long()
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if attention_mask is not None and self._use_flash_attention_2 and use_cache:
+            is_padding_right = attention_mask[:, -1].sum().item() != batch_size
+            if is_padding_right:
+                raise ValueError(
+                    "You are attempting to perform batched generation with padding_side='right'"
+                    " this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to "
+                    " call `tokenizer.padding_side  = 'left'` before tokenizing the input. "
+                )
+
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask,
+                (batch_size, seq_length),
+                inputs_embeds,
+                past_key_values_length,
+                sliding_window=self.config.sliding_window,
+            )
+
+        hidden_states = inputs_embeds
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_router_logits = () if output_router_logits else None
+        next_decoder_cache = None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    position_ids,
+                    past_key_values,
+                    output_attentions,
+                    output_router_logits,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    position_ids=position_ids,
+                    past_key_value=past_key_values,
+                    output_attentions=output_attentions,
+                    output_router_logits=output_router_logits,
+                    use_cache=use_cache,
+                )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+            if output_router_logits:
+                all_router_logits += (layer_outputs[-1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_router_logits]
+                if v is not None
+            )
+        return MoeModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            router_logits=all_router_logits,
+        )
+
+
+class MixtralForCausalLM(MixtralPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MixtralModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+        self.router_aux_loss_coef = config.router_aux_loss_coef
+        self.num_experts = config.num_local_experts
+        self.num_experts_per_tok = config.num_experts_per_tok
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def set_decoder(self, decoder):
+        self.model = decoder
+
+    def get_decoder(self):
+        return self.model
+
+    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    # Ignore copy
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, MoeCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, MixtralForCausalLM
+
+        >>> model = MixtralForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
+        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_router_logits = (
+            output_router_logits if output_router_logits is not None else self.config.output_router_logits
+        )
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            output_router_logits=output_router_logits,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        aux_loss = None
+        if output_router_logits:
+            aux_loss = load_balancing_loss_func(
+                outputs.router_logits if return_dict else outputs[-1], self.num_experts, self.num_experts_per_tok
+            )
+            if labels is not None:
+                loss += self.router_aux_loss_coef * aux_loss
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            if output_router_logits:
+                output = (aux_loss,) + output
+            return (loss,) + output if loss is not None else output
+
+        return MoeCausalLMOutputWithPast(
+            loss=loss,
+            aux_loss=aux_loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_logits,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        # Omit tokens covered by past_key_values
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """
+    The Mixtral Model transformer with a sequence classification head on top (linear layer).
+
+    [`MixtralForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    MIXTRAL_START_DOCSTRING,
+)
+# Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->Mixtral, LLAMA->MIXTRAL
+class MixtralForSequenceClassification(MixtralPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = MixtralModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(MIXTRAL_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = (torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1).to(
+                    logits.device
+                )
+            else:
+                sequence_lengths = -1
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
diff --git a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
index bc508a47984e2e..ecf9b65c2ca1d3 100644
--- a/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
+++ b/src/transformers/models/mobilebert/modeling_tf_mobilebert.py
@@ -1088,6 +1088,12 @@ def call(
             attentions=outputs.attentions,
         )
 
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if tf_weight == "cls.predictions.decoder.weight":
+            return tf_weight, "mobilebert.embeddings.word_embeddings.weight"
+        else:
+            return (tf_weight,)
+
 
 @add_start_docstrings("""MobileBert Model with a `language modeling` head on top.""", MOBILEBERT_START_DOCSTRING)
 class TFMobileBertForMaskedLM(TFMobileBertPreTrainedModel, TFMaskedLanguageModelingLoss):
@@ -1168,6 +1174,12 @@ def call(
             attentions=outputs.attentions,
         )
 
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if tf_weight == "cls.predictions.decoder.weight":
+            return tf_weight, "mobilebert.embeddings.word_embeddings.weight"
+        else:
+            return (tf_weight,)
+
 
 class TFMobileBertOnlyNSPHead(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
diff --git a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py
index 824e4be5e82e7a..73bb296d7ed144 100644
--- a/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py
+++ b/src/transformers/models/mobilenet_v1/image_processing_mobilenet_v1.py
@@ -79,10 +79,6 @@ class MobileNetV1ImageProcessor(BaseImageProcessor):
         image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
-        use_square_size (`bool`, *optional*, defaults to `False`):
-            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
-            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
-            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
     """
 
     model_input_names = ["pixel_values"]
@@ -99,12 +95,11 @@ def __init__(
         do_normalize: bool = True,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
-        use_square_size: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         size = size if size is not None else {"shortest_edge": 256}
-        size = get_size_dict(size, default_to_square=use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
         crop_size = get_size_dict(crop_size)
         self.do_resize = do_resize
@@ -117,7 +112,6 @@ def __init__(
         self.do_normalize = do_normalize
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
-        self.use_square_size = use_square_size
 
     # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
     def resize(
@@ -145,13 +139,19 @@ def resize(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        size = get_size_dict(size, default_to_square=self.use_square_size)
-        if "shortest_edge" not in size:
-            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
         output_size = get_resize_output_image_size(
             image,
-            size=size["shortest_edge"],
-            default_to_square=self.use_square_size,
+            size=size,
+            default_to_square=default_to_square,
             input_data_format=input_data_format,
         )
         return resize(
@@ -231,7 +231,7 @@ def preprocess(
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
-        size = get_size_dict(size, default_to_square=self.use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         resample = resample if resample is not None else self.resample
         do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
         crop_size = crop_size if crop_size is not None else self.crop_size
diff --git a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
index 99791efe04c6ad..aa97d854d7f47a 100644
--- a/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
+++ b/src/transformers/models/mobilenet_v2/image_processing_mobilenet_v2.py
@@ -83,10 +83,6 @@ class MobileNetV2ImageProcessor(BaseImageProcessor):
         image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
             Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
             number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
-        use_square_size (`bool`, *optional*, defaults to `False`):
-            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
-            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
-            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
     """
 
     model_input_names = ["pixel_values"]
@@ -103,12 +99,11 @@ def __init__(
         do_normalize: bool = True,
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
-        use_square_size: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         size = size if size is not None else {"shortest_edge": 256}
-        size = get_size_dict(size, default_to_square=use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
         crop_size = get_size_dict(crop_size, param_name="crop_size")
         self.do_resize = do_resize
@@ -121,7 +116,6 @@ def __init__(
         self.do_normalize = do_normalize
         self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
         self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
-        self.use_square_size = use_square_size
 
     # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize
     def resize(
@@ -149,13 +143,19 @@ def resize(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        size = get_size_dict(size, default_to_square=self.use_square_size)
-        if "shortest_edge" not in size:
-            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
         output_size = get_resize_output_image_size(
             image,
-            size=size["shortest_edge"],
-            default_to_square=self.use_square_size,
+            size=size,
+            default_to_square=default_to_square,
             input_data_format=input_data_format,
         )
         return resize(
@@ -235,7 +235,7 @@ def preprocess(
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
-        size = get_size_dict(size, default_to_square=self.use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         resample = resample if resample is not None else self.resample
         do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
         crop_size = crop_size if crop_size is not None else self.crop_size
diff --git a/src/transformers/models/mobilevit/image_processing_mobilevit.py b/src/transformers/models/mobilevit/image_processing_mobilevit.py
index 79eaeac9007b6f..ee16d439cb6b05 100644
--- a/src/transformers/models/mobilevit/image_processing_mobilevit.py
+++ b/src/transformers/models/mobilevit/image_processing_mobilevit.py
@@ -78,10 +78,6 @@ class MobileViTImageProcessor(BaseImageProcessor):
         do_flip_channel_order (`bool`, *optional*, defaults to `True`):
             Whether to flip the color channels from RGB to BGR. Can be overridden by the `do_flip_channel_order`
             parameter in the `preprocess` method.
-        use_square_size (`bool`, *optional*, defaults to `False`):
-            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
-            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
-            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
     """
 
     model_input_names = ["pixel_values"]
@@ -96,12 +92,11 @@ def __init__(
         do_center_crop: bool = True,
         crop_size: Dict[str, int] = None,
         do_flip_channel_order: bool = True,
-        use_square_size: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         size = size if size is not None else {"shortest_edge": 224}
-        size = get_size_dict(size, default_to_square=use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else {"height": 256, "width": 256}
         crop_size = get_size_dict(crop_size, param_name="crop_size")
 
@@ -113,7 +108,6 @@ def __init__(
         self.do_center_crop = do_center_crop
         self.crop_size = crop_size
         self.do_flip_channel_order = do_flip_channel_order
-        self.use_square_size = use_square_size
 
     # Copied from transformers.models.mobilenet_v1.image_processing_mobilenet_v1.MobileNetV1ImageProcessor.resize with PILImageResampling.BICUBIC->PILImageResampling.BILINEAR
     def resize(
@@ -141,13 +135,19 @@ def resize(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        size = get_size_dict(size, default_to_square=self.use_square_size)
-        if "shortest_edge" not in size:
-            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
         output_size = get_resize_output_image_size(
             image,
-            size=size["shortest_edge"],
-            default_to_square=self.use_square_size,
+            size=size,
+            default_to_square=default_to_square,
             input_data_format=input_data_format,
         )
         return resize(
@@ -246,7 +246,7 @@ def preprocess(
         )
 
         size = size if size is not None else self.size
-        size = get_size_dict(size, default_to_square=self.use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else self.crop_size
         crop_size = get_size_dict(crop_size, param_name="crop_size")
 
diff --git a/src/transformers/models/mpt/modeling_mpt.py b/src/transformers/models/mpt/modeling_mpt.py
index e2d12f0e777ee7..ae8e7e9031129f 100644
--- a/src/transformers/models/mpt/modeling_mpt.py
+++ b/src/transformers/models/mpt/modeling_mpt.py
@@ -265,7 +265,7 @@ def _init_weights(self, module: nn.Module):
 
     @staticmethod
     def _convert_to_mpt_cache(
-        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]]
+        past_key_value: Tuple[Tuple[torch.Tensor, torch.Tensor]],
     ) -> Tuple[Tuple[torch.Tensor, torch.Tensor]]:
         """
         Converts the cache to the format expected by Mpt, i.e. to tuple(tuple([batch_size * num_heads, ...]))
diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py
index aab93711dfc602..9464979a2b8e46 100644
--- a/src/transformers/models/mt5/configuration_mt5.py
+++ b/src/transformers/models/mt5/configuration_mt5.py
@@ -40,8 +40,8 @@ class MT5Config(PretrainedConfig):
         d_model (`int`, *optional*, defaults to 512):
             Size of the encoder layers and the pooler layer.
         d_kv (`int`, *optional*, defaults to 64):
-            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
-            num_heads`.
+            Size of the key, query, value projections per attention head. In the conventional context, it is typically expected that `d_kv` has to be equal to `d_model // num_heads`.
+            But in the architecture of mt5-small, `d_kv` is not equal to `d_model //num_heads`. The `inner_dim` of the projection layer will be defined as `num_heads * d_kv`.
         d_ff (`int`, *optional*, defaults to 1024):
             Size of the intermediate feed forward layer in each `T5Block`.
         num_layers (`int`, *optional*, defaults to 8):
diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py
index 584b29e623134a..8cca8108efd0b8 100644
--- a/src/transformers/models/musicgen/modeling_musicgen.py
+++ b/src/transformers/models/musicgen/modeling_musicgen.py
@@ -1869,7 +1869,7 @@ def forward(
                     "disabled by setting `chunk_length=None` in the audio encoder."
                 )
 
-            if self.config.audio_channels == 2 and audio_codes.shape[2] == self.decoder.num_codebooks // 2:
+            if self.config.decoder.audio_channels == 2 and audio_codes.shape[2] == self.decoder.num_codebooks // 2:
                 # mono input through encodec that we convert to stereo
                 audio_codes = audio_codes.repeat_interleave(2, dim=2)
 
diff --git a/src/transformers/models/oneformer/image_processing_oneformer.py b/src/transformers/models/oneformer/image_processing_oneformer.py
index da060fa0514fab..c42001a96252f2 100644
--- a/src/transformers/models/oneformer/image_processing_oneformer.py
+++ b/src/transformers/models/oneformer/image_processing_oneformer.py
@@ -1167,6 +1167,7 @@ def post_process_instance_segmentation(
         class_queries_logits = outputs.class_queries_logits  # [batch_size, num_queries, num_classes+1]
         masks_queries_logits = outputs.masks_queries_logits  # [batch_size, num_queries, height, width]
 
+        device = masks_queries_logits.device
         batch_size = class_queries_logits.shape[0]
         num_queries = class_queries_logits.shape[1]
         num_classes = class_queries_logits.shape[-1] - 1
@@ -1177,7 +1178,7 @@ def post_process_instance_segmentation(
         for i in range(batch_size):
             # [Q, K]
             scores = torch.nn.functional.softmax(class_queries_logits[i], dim=-1)[:, :-1]
-            labels = torch.arange(num_classes).unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
+            labels = torch.arange(num_classes, device=device).unsqueeze(0).repeat(num_queries, 1).flatten(0, 1)
 
             # scores_per_image, topk_indices = scores.flatten(0, 1).topk(self.num_queries, sorted=False)
             scores_per_image, topk_indices = scores.flatten(0, 1).topk(num_queries, sorted=False)
diff --git a/src/transformers/models/opt/modeling_opt.py b/src/transformers/models/opt/modeling_opt.py
index 5b56ff9e618ddf..9b22a51abc77aa 100644
--- a/src/transformers/models/opt/modeling_opt.py
+++ b/src/transformers/models/opt/modeling_opt.py
@@ -16,6 +16,7 @@
 from typing import List, Optional, Tuple, Union
 
 import torch
+import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
@@ -33,12 +34,19 @@
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
 from .configuration_opt import OPTConfig
 
 
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+
 logger = logging.get_logger(__name__)
 
 _CHECKPOINT_FOR_DOC = "facebook/opt-350m"
@@ -64,6 +72,19 @@
 ]
 
 
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
 class OPTLearnedPositionalEmbedding(nn.Embedding):
     """
     This module learns positional embeddings up to a fixed maximum size.
@@ -93,30 +114,49 @@ class OPTAttention(nn.Module):
 
     def __init__(
         self,
-        embed_dim: int,
-        num_heads: int,
-        dropout: float = 0.0,
+        config: OPTConfig,
         is_decoder: bool = False,
-        bias: bool = True,
+        **kwargs,
     ):
         super().__init__()
-        self.embed_dim = embed_dim
-        self.num_heads = num_heads
-        self.dropout = dropout
-        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        def _handle_deprecated_argument(config_arg_name, config, fn_arg_name, kwargs):
+            """
+            If a the deprecated argument `fn_arg_name` is passed, raise a deprecation
+            warning and return that value, otherwise take the equivalent config.config_arg_name
+            """
+            val = None
+            if fn_arg_name in kwargs:
+                logging.warning(
+                    "Passing in {} to {self.__class__.__name__} is deprecated and won't be supported from v4.38."
+                    " Please set it in the config instead"
+                )
+                val = kwargs.pop(fn_arg_name)
+            else:
+                val = getattr(config, config_arg_name)
+            return val
 
-        if (self.head_dim * num_heads) != self.embed_dim:
+        self.embed_dim = _handle_deprecated_argument("hidden_size", config, "embed_dim", kwargs)
+        self.num_heads = _handle_deprecated_argument("num_attention_heads", config, "num_heads", kwargs)
+        self.dropout = _handle_deprecated_argument("attention_dropout", config, "dropout", kwargs)
+        self.enable_bias = _handle_deprecated_argument("enable_bias", config, "bias", kwargs)
+
+        self.head_dim = self.embed_dim // self.num_heads
+        self.is_causal = True
+
+        if (self.head_dim * self.num_heads) != self.embed_dim:
             raise ValueError(
                 f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
-                f" and `num_heads`: {num_heads})."
+                f" and `num_heads`: {self.num_heads})."
             )
         self.scaling = self.head_dim**-0.5
         self.is_decoder = is_decoder
 
-        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
-        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=self.enable_bias)
 
     def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
@@ -242,17 +282,228 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
+class OptFlashAttention2(OPTAttention):
+    """
+    OPT flash attention module. This module inherits from `OPTAttention` as the weights of the module stays untouched.
+    The only required change would be on the forward pass where it needs to correctly call the public API of flash
+    attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, _, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        query_length = query_states.shape[1]
+        tgt_len = key_states.shape[-2]
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        query_states = query_states.view(bsz, query_length, self.num_heads, self.head_dim)
+        key_states = key_states.transpose(1, 2).view(bsz, tgt_len, self.num_heads, self.head_dim)
+        value_states = value_states.transpose(1, 2).view(bsz, tgt_len, self.num_heads, self.head_dim)
+
+        attn_dropout = self.dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, query_length, dropout=attn_dropout
+        )
+
+        attn_weights_reshaped = attn_output.reshape(bsz, query_length, self.num_heads * self.head_dim)
+        attn_output = self.out_proj(attn_weights_reshaped)
+
+        if not output_attentions:
+            attn_weights_reshaped = None
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+OPT_ATTENTION_CLASSES = {
+    "eager": OPTAttention,
+    "flash_attention_2": OptFlashAttention2,
+}
+
+
 class OPTDecoderLayer(nn.Module):
     def __init__(self, config: OPTConfig):
         super().__init__()
         self.embed_dim = config.hidden_size
-        self.self_attn = OPTAttention(
-            embed_dim=self.embed_dim,
-            num_heads=config.num_attention_heads,
-            dropout=config.attention_dropout,
-            is_decoder=True,
-            bias=config.enable_bias,
-        )
+
+        self.self_attn = OPT_ATTENTION_CLASSES[config._attn_implementation](config=config, is_decoder=True)
+
         self.do_layer_norm_before = config.do_layer_norm_before
         self.dropout = config.dropout
         self.activation_fn = ACT2FN[config.activation_function]
@@ -368,6 +619,7 @@ class OPTPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _no_split_modules = ["OPTDecoderLayer"]
+    _supports_flash_attn_2 = True
 
     def _init_weights(self, module):
         std = self.config.init_std
@@ -483,6 +735,7 @@ def __init__(self, config: OPTConfig):
             self.final_layer_norm = None
 
         self.layers = nn.ModuleList([OPTDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -581,16 +834,27 @@ def forward(
         mask_seq_length = past_key_values_length + seq_length
 
         # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
-        elif attention_mask.shape[1] != mask_seq_length:
-            raise ValueError(
-                f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
-                f"{mask_seq_length} (sum of the lengths of current and past inputs)"
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            causal_attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+            attention_mask = (
+                torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+                if attention_mask is None
+                else attention_mask
             )
-        causal_attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask, input_shape, inputs_embeds, past_key_values_length
-        )
+        else:
+            # 4d mask is passed through the layers
+            if attention_mask is None:
+                attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+            elif attention_mask.shape[1] != mask_seq_length:
+                raise ValueError(
+                    f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be "
+                    f"{mask_seq_length} (sum of the lengths of current and past inputs)"
+                )
+            causal_attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
+
         pos_embeds = self.embed_positions(attention_mask, past_key_values_length)
 
         if self.project_in is not None:
diff --git a/src/transformers/models/owlv2/modeling_owlv2.py b/src/transformers/models/owlv2/modeling_owlv2.py
index 9c7cede8fbf912..5146fbb89dcee6 100644
--- a/src/transformers/models/owlv2/modeling_owlv2.py
+++ b/src/transformers/models/owlv2/modeling_owlv2.py
@@ -1544,20 +1544,39 @@ def image_guided_detection(
         >>> import requests
         >>> from PIL import Image
         >>> import torch
+        >>> import numpy as np
         >>> from transformers import AutoProcessor, Owlv2ForObjectDetection
+        >>> from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 
         >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
         >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
+
         >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
         >>> image = Image.open(requests.get(url, stream=True).raw)
         >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
         >>> query_image = Image.open(requests.get(query_url, stream=True).raw)
         >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")
+
+        >>> # forward pass
         >>> with torch.no_grad():
         ...     outputs = model.image_guided_detection(**inputs)
-        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
-        >>> target_sizes = torch.Tensor([image.size[::-1]])
-        >>> # Convert outputs (bounding boxes and class logits) to COCO API
+
+        >>> # Note: boxes need to be visualized on the padded, unnormalized image
+        >>> # hence we'll set the target image sizes (height, width) based on that
+
+        >>> def get_preprocessed_image(pixel_values):
+        ...     pixel_values = pixel_values.squeeze().numpy()
+        ...     unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
+        ...     unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
+        ...     unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
+        ...     unnormalized_image = Image.fromarray(unnormalized_image)
+        ...     return unnormalized_image
+
+        >>> unnormalized_image = get_preprocessed_image(inputs.pixel_values)
+
+        >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
+
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> results = processor.post_process_image_guided_detection(
         ...     outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
         ... )
@@ -1566,19 +1585,19 @@ def image_guided_detection(
         >>> for box, score in zip(boxes, scores):
         ...     box = [round(i, 2) for i in box.tolist()]
         ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
-        Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
-        Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
-        Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.79]
-        Detected similar object with confidence 0.985 at location [176.97, -29.45, 672.69, 182.83]
-        Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
-        Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
-        Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
-        Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
-        Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
-        Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
-        Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
-        Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
-        Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
+        Detected similar object with confidence 0.938 at location [490.96, 109.89, 821.09, 536.11]
+        Detected similar object with confidence 0.959 at location [8.67, 721.29, 928.68, 732.78]
+        Detected similar object with confidence 0.902 at location [4.27, 720.02, 941.45, 761.59]
+        Detected similar object with confidence 0.985 at location [265.46, -58.9, 1009.04, 365.66]
+        Detected similar object with confidence 1.0 at location [9.79, 28.69, 937.31, 941.64]
+        Detected similar object with confidence 0.998 at location [869.97, 58.28, 923.23, 978.1]
+        Detected similar object with confidence 0.985 at location [309.23, 21.07, 371.61, 932.02]
+        Detected similar object with confidence 0.947 at location [27.93, 859.45, 969.75, 915.44]
+        Detected similar object with confidence 0.996 at location [785.82, 41.38, 880.26, 966.37]
+        Detected similar object with confidence 0.998 at location [5.08, 721.17, 925.93, 998.41]
+        Detected similar object with confidence 0.969 at location [6.7, 898.1, 921.75, 949.51]
+        Detected similar object with confidence 0.966 at location [47.16, 927.29, 981.99, 942.14]
+        Detected similar object with confidence 0.924 at location [46.4, 936.13, 953.02, 950.78]
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -1650,8 +1669,10 @@ def forward(
         ```python
         >>> import requests
         >>> from PIL import Image
+        >>> import numpy as np
         >>> import torch
         >>> from transformers import AutoProcessor, Owlv2ForObjectDetection
+        >>> from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD
 
         >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
         >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")
@@ -1660,10 +1681,25 @@ def forward(
         >>> image = Image.open(requests.get(url, stream=True).raw)
         >>> texts = [["a photo of a cat", "a photo of a dog"]]
         >>> inputs = processor(text=texts, images=image, return_tensors="pt")
-        >>> outputs = model(**inputs)
 
-        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
-        >>> target_sizes = torch.Tensor([image.size[::-1]])
+        >>> # forward pass
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # Note: boxes need to be visualized on the padded, unnormalized image
+        >>> # hence we'll set the target image sizes (height, width) based on that
+
+        >>> def get_preprocessed_image(pixel_values):
+        ...     pixel_values = pixel_values.squeeze().numpy()
+        ...     unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None]
+        ...     unnormalized_image = (unnormalized_image * 255).astype(np.uint8)
+        ...     unnormalized_image = np.moveaxis(unnormalized_image, 0, -1)
+        ...     unnormalized_image = Image.fromarray(unnormalized_image)
+        ...     return unnormalized_image
+
+        >>> unnormalized_image = get_preprocessed_image(inputs.pixel_values)
+
+        >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]])
         >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores
         >>> results = processor.post_process_object_detection(
         ...     outputs=outputs, threshold=0.2, target_sizes=target_sizes
@@ -1676,8 +1712,8 @@ def forward(
         >>> for box, score, label in zip(boxes, scores, labels):
         ...     box = [round(i, 2) for i in box.tolist()]
         ...     print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}")
-        Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51]
-        Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85]
+        Detected a photo of a cat with confidence 0.614 at location [512.5, 35.08, 963.48, 557.02]
+        Detected a photo of a cat with confidence 0.665 at location [10.13, 77.94, 489.93, 709.69]
         ```"""
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
diff --git a/src/transformers/models/owlvit/modeling_owlvit.py b/src/transformers/models/owlvit/modeling_owlvit.py
index 8c502c410d3b17..b8e8a36fec777c 100644
--- a/src/transformers/models/owlvit/modeling_owlvit.py
+++ b/src/transformers/models/owlvit/modeling_owlvit.py
@@ -1517,7 +1517,7 @@ def image_guided_detection(
         ...     outputs = model.image_guided_detection(**inputs)
         >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
         >>> target_sizes = torch.Tensor([image.size[::-1]])
-        >>> # Convert outputs (bounding boxes and class logits) to COCO API
+        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> results = processor.post_process_image_guided_detection(
         ...     outputs=outputs, threshold=0.6, nms_threshold=0.3, target_sizes=target_sizes
         ... )
diff --git a/src/transformers/models/patchtsmixer/__init__.py b/src/transformers/models/patchtsmixer/__init__.py
new file mode 100644
index 00000000000000..63f433791e1fe8
--- /dev/null
+++ b/src/transformers/models/patchtsmixer/__init__.py
@@ -0,0 +1,69 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {
+    "configuration_patchtsmixer": [
+        "PATCHTSMIXER_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "PatchTSMixerConfig",
+    ],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_patchtsmixer"] = [
+        "PATCHTSMIXER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "PatchTSMixerPreTrainedModel",
+        "PatchTSMixerModel",
+        "PatchTSMixerForPretraining",
+        "PatchTSMixerForPrediction",
+        "PatchTSMixerForTimeSeriesClassification",
+        "PatchTSMixerForRegression",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_patchtsmixer import (
+        PATCHTSMIXER_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        PatchTSMixerConfig,
+    )
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_patchtsmixer import (
+            PATCHTSMIXER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            PatchTSMixerForPrediction,
+            PatchTSMixerForPretraining,
+            PatchTSMixerForRegression,
+            PatchTSMixerForTimeSeriesClassification,
+            PatchTSMixerModel,
+            PatchTSMixerPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py b/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py
new file mode 100644
index 00000000000000..ae259e40306313
--- /dev/null
+++ b/src/transformers/models/patchtsmixer/configuration_patchtsmixer.py
@@ -0,0 +1,236 @@
+# coding=utf-8
+# Copyright 2023 IBM and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PatchTSMixer model configuration"""
+
+from typing import List, Optional, Union
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+PATCHTSMIXER_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "ibm/patchtsmixer-etth1-pretrain": "https://huggingface.co/ibm/patchtsmixer-etth1-pretrain/resolve/main/config.json",
+}
+
+
+class PatchTSMixerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PatchTSMixerModel`]. It is used to instantiate a
+    PatchTSMixer model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the PatchTSMixer
+    [ibm/patchtsmixer-etth1-pretrain](https://huggingface.co/ibm/patchtsmixer-etth1-pretrain) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        context_length (`int`, *optional*, defaults to 32):
+            The context/history length for the input sequence.
+        patch_len (`int`, *optional*, defaults to 8):
+            The patch length for the input sequence.
+        num_input_channels (`int`, *optional*, defaults to 1):
+            Number of input variates. For Univariate, set it to 1.
+        patch_stride (`int`, *optional*, defaults to 8):
+            Determines the overlap between two consecutive patches. Set it to patch_length (or greater), if we want
+            non-overlapping patches.
+        num_parallel_samples (`int`, *optional*, defaults to 100):
+            The number of samples to generate in parallel for probabilistic forecast.
+        d_model (`int`, *optional*, defaults to 8):
+            Hidden dimension of the model. Recommended to set it as a multiple of patch_length (i.e. 2-5X of
+            patch_len). Larger value indicates more complex model.
+        expansion_factor (`int`, *optional*, defaults to 2):
+            Expansion factor to use inside MLP. Recommended range is 2-5. Larger value indicates more complex model.
+        num_layers (`int`, *optional*, defaults to 3):
+            Number of layers to use. Recommended range is 3-15. Larger value indicates more complex model.
+        dropout (`float`, *optional*, defaults to 0.2):
+            The dropout probability the `PatchTSMixer` backbone. Recommended range is 0.2-0.7
+        mode (`str`, *optional*, defaults to `"common_channel"`):
+            Mixer Mode. Determines how to process the channels. Allowed values: "common_channel", "mix_channel". In
+            "common_channel" mode, we follow Channel-independent modelling with no explicit channel-mixing. Channel
+            mixing happens in an implicit manner via shared weights across channels. (preferred first approach) In
+            "mix_channel" mode, we follow explicit channel-mixing in addition to patch and feature mixer. (preferred
+            approach when channel correlations are very important to model)
+        gated_attn (`bool`, *optional*, defaults to `True`):
+            Enable Gated Attention.
+        norm_mlp (`str`, *optional*, defaults to `"LayerNorm"`):
+            Normalization layer (BatchNorm or LayerNorm).
+        self_attn (`bool`, *optional*, defaults to `False`):
+            Enable Tiny self attention across patches. This can be enabled when the output of Vanilla PatchTSMixer with
+            gated attention is not satisfactory. Enabling this leads to explicit pair-wise attention and modelling
+            across patches.
+        self_attn_heads (`int`, *optional*, defaults to 1):
+            Number of self-attention heads. Works only when `self_attn` is set to `True`.
+        use_positional_encoding (`bool`, *optional*, defaults to `False`):
+            Enable the use of positional embedding for the tiny self-attention layers. Works only when `self_attn` is
+            set to `True`.
+        positional_encoding_type (`str`, *optional*, defaults to `"sincos"`):
+            Positional encodings. Options `"random"` and `"sincos"` are supported. Works only when
+            `use_positional_encoding` is set to `True`
+        scaling (`string` or `bool`, *optional*, defaults to `"std"`):
+            Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
+            scaler is set to "mean".
+        loss (`string`, *optional*, defaults to `"mse"`):
+            The loss function for the model corresponding to the `distribution_output` head. For parametric
+            distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared
+            error "mse".
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal weight initialization distribution.
+        post_init (`bool`, *optional*, defaults to `False`):
+            Whether to use custom weight initialization from `transformers` library, or the default initialization in
+            `PyTorch`. Setting it to `False` performs `PyTorch` weight initialization.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            A value added to the denominator for numerical stability of normalization.
+        mask_type (`str`, *optional*, defaults to `"random"`):
+            Type of masking to use for Masked Pretraining mode. Allowed values are "random", "forecast". In Random
+            masking, points are masked randomly. In Forecast masking, points are masked towards the end.
+        random_mask_ratio (`float`, *optional*, defaults to 0.5):
+            Masking ratio to use when `mask_type` is `random`. Higher value indicates more masking.
+        num_forecast_mask_patches (`int` or `list`, *optional*, defaults to `[2]`):
+            Number of patches to be masked at the end of each batch sample. If it is an integer, all the samples in the
+            batch will have the same number of masked patches. If it is a list, samples in the batch will be randomly
+            masked by numbers defined in the list. This argument is only used for forecast pretraining.
+        mask_value (`float`, *optional*, defaults to `0.0`):
+            Mask value to use.
+        masked_loss (`bool`, *optional*, defaults to `True`):
+            Whether to compute pretraining loss only at the masked portions, or on the entire output.
+        channel_consistent_masking (`bool`, *optional*, defaults to `True`):
+            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
+            across channels.
+        unmasked_channel_indices (`list`, *optional*):
+            Channels that are not masked during pretraining.
+        head_dropout (`float`, *optional*, defaults to 0.2):
+            The dropout probability the `PatchTSMixer` head.
+        distribution_output (`string`, *optional*, defaults to `"student_t"`):
+            The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or
+            "negative_binomial".
+        prediction_length (`int`, *optional*, defaults to 16):
+            Number of time steps to forecast for a forecasting task. Also known as the Forecast Horizon.
+        prediction_channel_indices (`list`, *optional*):
+            List of channel indices to forecast. If None, forecast all channels. Target data is expected to have all
+            channels and we explicitly filter the channels in prediction and target before loss computation.
+        num_targets (`int`, *optional*, defaults to 3):
+            Number of targets (dimensionality of the regressed variable) for a regression task.
+        output_range (`list`, *optional*):
+            Output range to restrict for the regression task. Defaults to None.
+        head_aggregation (`str`, *optional*, defaults to `"max_pool"`):
+            Aggregation mode to enable for classification or regression task. Allowed values are `None`, "use_last",
+            "max_pool", "avg_pool".
+
+    Example:
+
+    ```python
+    >>> from transformers import PatchTSMixerConfig, PatchTSMixerModel
+
+    >>> # Initializing a default PatchTSMixer configuration
+    >>> configuration = PatchTSMixerConfig()
+
+    >>> # Randomly initializing a model (with random weights) from the configuration
+    >>> model = PatchTSMixerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "patchtsmixer"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        # Time series specific configuration
+        context_length: int = 32,
+        patch_len: int = 8,
+        num_input_channels: int = 1,
+        patch_stride: int = 8,
+        num_parallel_samples: int = 100,
+        # General model configuration
+        d_model: int = 8,
+        expansion_factor: int = 2,
+        num_layers: int = 3,
+        dropout: float = 0.2,
+        mode: str = "common_channel",
+        gated_attn: bool = True,
+        norm_mlp: str = "LayerNorm",
+        self_attn: bool = False,
+        self_attn_heads: int = 1,
+        use_positional_encoding: bool = False,
+        positional_encoding_type: str = "sincos",
+        scaling: Optional[Union[str, bool]] = "std",
+        loss: str = "mse",
+        init_std: float = 0.02,
+        post_init: bool = False,
+        norm_eps: float = 1e-5,
+        # Pretrain model configuration
+        mask_type: str = "random",
+        random_mask_ratio: float = 0.5,
+        num_forecast_mask_patches: Optional[Union[List[int], int]] = [2],
+        mask_value: int = 0,
+        masked_loss: bool = True,
+        channel_consistent_masking: bool = True,
+        unmasked_channel_indices: Optional[List[int]] = None,
+        # General head configuration
+        head_dropout: float = 0.2,
+        distribution_output: str = "student_t",
+        # Prediction head configuration
+        prediction_length: int = 16,
+        prediction_channel_indices: list = None,
+        # Classification/Regression configuration
+        num_targets: int = 3,
+        output_range: list = None,
+        head_aggregation: str = "max_pool",
+        **kwargs,
+    ):
+        self.num_input_channels = num_input_channels
+        self.context_length = context_length
+        self.patch_length = patch_len
+        self.patch_stride = patch_stride
+        self.d_model = d_model
+        self.expansion_factor = expansion_factor
+        self.num_layers = num_layers
+        self.dropout = dropout
+        self.mode = mode
+        self.gated_attn = gated_attn
+        self.norm_mlp = norm_mlp
+        self.scaling = scaling
+        self.head_dropout = head_dropout
+        self.num_patches = (max(context_length, patch_len) - patch_len) // patch_stride + 1
+        self.mask_type = mask_type
+        self.random_mask_ratio = random_mask_ratio
+        self.num_forecast_mask_patches = num_forecast_mask_patches
+        self.mask_value = mask_value
+        self.channel_consistent_masking = channel_consistent_masking
+        self.masked_loss = masked_loss
+        self.patch_last = True
+        self.use_positional_encoding = use_positional_encoding
+        self.positional_encoding_type = positional_encoding_type
+        self.prediction_length = prediction_length
+        self.prediction_channel_indices = prediction_channel_indices
+        self.num_targets = num_targets
+        self.output_range = output_range
+        self.head_aggregation = head_aggregation
+        self.self_attn = self_attn
+        self.self_attn_heads = self_attn_heads
+        self.init_std = init_std
+        self.post_init = post_init
+        self.distribution_output = distribution_output
+        self.loss = loss
+        self.num_parallel_samples = num_parallel_samples
+        self.unmasked_channel_indices = unmasked_channel_indices
+        self.norm_eps = norm_eps
+        super().__init__(**kwargs)
diff --git a/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py
new file mode 100644
index 00000000000000..2a2bd2a3db6e7e
--- /dev/null
+++ b/src/transformers/models/patchtsmixer/modeling_patchtsmixer.py
@@ -0,0 +1,2174 @@
+# coding=utf-8
+# Copyright 2023 IBM and HuggingFace Inc. team. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch PatchTSMixer model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput
+
+from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_patchtsmixer import PatchTSMixerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "PatchTSMixerConfig"
+
+
+PATCHTSMIXER_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "ibm/patchtsmixer-etth1-pretrain",
+    # See all PatchTSMixer models at https://huggingface.co/models?filter=patchtsmixer
+]
+
+
+PATCHTSMIXER_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`PatchTSMixerConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+        mask_input (`bool`, *optional*, defaults to `False`):
+            If True, Masking will be enabled. False otherwise.
+"""
+
+PATCHTSMIXER_INPUTS_DOCSTRING = r"""
+    Args:
+        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
+            Context values of the time series. For a pretraining task, this denotes the input time series to predict
+            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
+            for classification or regression tasks, it denotes the appropriate context values of the time series.
+
+            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
+            greater than 1.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class PatchTSMixerGatedAttention(nn.Module):
+    """
+    Module that applies gated attention to input data.
+
+    Args:
+        in_size (`int`): The input size.
+        out_size (`int`): The output size.
+    """
+
+    def __init__(self, in_size: int, out_size: int):
+        super().__init__()
+        self.attn_layer = nn.Linear(in_size, out_size)
+        self.attn_softmax = nn.Softmax(dim=-1)
+
+    def forward(self, inputs):
+        attn_weight = self.attn_softmax(self.attn_layer(inputs))
+        inputs = inputs * attn_weight
+        return inputs
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.PatchTSTBatchNorm with PatchTST->PatchTSMixer
+class PatchTSMixerBatchNorm(nn.Module):
+    """
+    Compute batch normalization over the sequence length (time) dimension.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+        self.batchnorm = nn.BatchNorm1d(config.d_model, eps=config.norm_eps)
+
+    def forward(self, inputs: torch.Tensor):
+        """
+        Parameters:
+            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
+                input for Batch norm calculation
+        Returns:
+            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
+        """
+        output = inputs.transpose(1, 2)  # output: (batch_size, d_model, sequence_length)
+        output = self.batchnorm(output)
+        return output.transpose(1, 2)
+
+
+class PatchTSMixerPositionalEncoding(nn.Module):
+    """
+    Class for positional encoding
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+        # positional encoding: [num_patches x d_model]
+        if config.use_positional_encoding:
+            self.position_enc = self._init_pe(config)
+        else:
+            self.position_enc = nn.Parameter(torch.zeros(config.num_patches, config.d_model))
+
+    @staticmethod
+    def _init_pe(config: PatchTSMixerConfig) -> nn.Parameter:
+        # Positional encoding
+        if config.positional_encoding_type == "random":
+            position_enc = nn.Parameter(torch.randn(config.num_patches, config.d_model), requires_grad=True)
+        elif config.positional_encoding_type == "sincos":
+            position_enc = torch.zeros(config.num_patches, config.d_model)
+            position = torch.arange(0, config.num_patches).unsqueeze(1)
+            div_term = torch.exp(torch.arange(0, config.d_model, 2) * -(math.log(10000.0) / config.d_model))
+            position_enc[:, 0::2] = torch.sin(position * div_term)
+            position_enc[:, 1::2] = torch.cos(position * div_term)
+            position_enc = position_enc - position_enc.mean()
+            position_enc = position_enc / (position_enc.std() * 10)
+            position_enc = nn.Parameter(position_enc, requires_grad=False)
+        else:
+            raise ValueError(
+                f"{config.positional_encoding_type} is not a valid positional encoder. Available types are 'random' and 'sincos'."
+            )
+        return position_enc
+
+    def forward(self, patch_input: torch.Tensor):
+        # hidden_state: [bs x num_channels x num_patches x d_model]
+        hidden_state = patch_input + self.position_enc
+        return hidden_state
+
+
+class PatchTSMixerNormLayer(nn.Module):
+    """Normalization block
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.norm_mlp = config.norm_mlp
+
+        if "batch" in config.norm_mlp.lower():
+            self.norm = PatchTSMixerBatchNorm(config)
+        else:
+            self.norm = nn.LayerNorm(config.d_model, eps=config.norm_eps)
+
+    def forward(self, inputs: torch.Tensor):
+        """
+        Args:
+            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
+                Input to the normalization layer.
+        Returns:
+            `torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`
+        """
+        if "batch" in self.norm_mlp.lower():
+            # reshape the data
+            inputs_reshaped = torch.reshape(
+                inputs,
+                (
+                    inputs.shape[0] * inputs.shape[1],
+                    inputs.shape[2],
+                    inputs.shape[3],
+                ),
+            )  # inputs_reshaped: [batch_size*num_channels, num_patches, d_model]
+
+            # inputs_reshaped: [batch_size*num_channels, num_patches, d_model]
+            inputs_reshaped = self.norm(inputs_reshaped)
+
+            # put back data to the original shape
+            inputs = torch.reshape(inputs_reshaped, inputs.shape)
+
+        else:
+            inputs = self.norm(inputs)
+
+        return inputs
+
+
+class PatchTSMixerMLP(nn.Module):
+    def __init__(self, in_features, out_features, config):
+        super().__init__()
+        num_hidden = in_features * config.expansion_factor
+        self.fc1 = nn.Linear(in_features, num_hidden)
+        self.dropout1 = nn.Dropout(config.dropout)
+        self.fc2 = nn.Linear(num_hidden, out_features)
+        self.dropout2 = nn.Dropout(config.dropout)
+
+    def forward(self, inputs: torch.Tensor):
+        """
+        Args:
+            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
+                Input to the MLP layer.
+        Returns:
+            `torch.Tensor` of the same shape as `inputs`
+        """
+        inputs = self.dropout1(nn.functional.gelu(self.fc1(inputs)))
+        inputs = self.fc2(inputs)
+        inputs = self.dropout2(inputs)
+        return inputs
+
+
+class PatchTSMixerChannelFeatureMixerBlock(nn.Module):
+    """This module mixes the features in the channel dimension.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.norm = PatchTSMixerNormLayer(config)
+        self.gated_attn = config.gated_attn
+        self.mlp = PatchTSMixerMLP(
+            in_features=config.num_input_channels,
+            out_features=config.num_input_channels,
+            config=config,
+        )
+
+        if config.gated_attn:
+            self.gating_block = PatchTSMixerGatedAttention(
+                in_size=config.num_input_channels, out_size=config.num_input_channels
+            )
+
+    def forward(self, inputs: torch.Tensor):
+        """
+        Args:
+            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
+                input to the MLP layer
+        Returns:
+            `torch.Tensor` of the same shape as `inputs`
+        """
+        residual = inputs
+        inputs = self.norm(inputs)
+
+        inputs = inputs.permute(0, 3, 2, 1)
+
+        if self.gated_attn:
+            inputs = self.gating_block(inputs)
+
+        inputs = self.mlp(inputs)
+
+        inputs = inputs.permute(0, 3, 2, 1)
+
+        out = inputs + residual
+        return out
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTSMixer
+class PatchTSMixerAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[PatchTSMixerConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+class PatchMixerBlock(nn.Module):
+    """This module mixes the patch dimension.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.norm = PatchTSMixerNormLayer(config)
+
+        self.self_attn = config.self_attn
+        self.gated_attn = config.gated_attn
+
+        self.mlp = PatchTSMixerMLP(
+            in_features=config.num_patches,
+            out_features=config.num_patches,
+            config=config,
+        )
+
+        if config.gated_attn:
+            self.gating_block = PatchTSMixerGatedAttention(in_size=config.num_patches, out_size=config.num_patches)
+
+        if config.self_attn:
+            self.self_attn_layer = PatchTSMixerAttention(
+                embed_dim=config.d_model,
+                num_heads=config.self_attn_heads,
+                dropout=config.dropout,
+            )
+            self.norm_attn = PatchTSMixerNormLayer(config)
+
+    def forward(self, hidden_state):
+        """
+        Args:
+            hidden_state (`torch.Tensor`): Input tensor.
+
+        Returns:
+            `torch.Tensor`: Transformed tensor.
+        """
+        residual = hidden_state
+
+        hidden_state = self.norm(hidden_state)
+
+        if self.self_attn:
+            batch_size, n_vars, num_patches, d_model = hidden_state.shape
+            hidden_state_reshaped = hidden_state.reshape(batch_size * n_vars, num_patches, d_model)
+
+            x_attn, _, _ = self.self_attn_layer(hidden_state_reshaped, output_attentions=False)
+            x_attn = x_attn.reshape(batch_size, n_vars, num_patches, d_model)
+
+        # Transpose so that num_patches is the last dimension
+        hidden_state = hidden_state.transpose(2, 3)
+        hidden_state = self.mlp(hidden_state)
+
+        if self.gated_attn:
+            hidden_state = self.gating_block(hidden_state)
+
+        # Transpose back
+        hidden_state = hidden_state.transpose(2, 3)
+
+        if self.self_attn:
+            hidden_state = self.norm_attn(hidden_state + x_attn)
+
+        out = hidden_state + residual
+        return out
+
+
+class FeatureMixerBlock(nn.Module):
+    """This module mixes the hidden feature dimension.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.norm = PatchTSMixerNormLayer(config)
+
+        self.gated_attn = config.gated_attn
+
+        self.mlp = PatchTSMixerMLP(
+            in_features=config.d_model,
+            out_features=config.d_model,
+            config=config,
+        )
+
+        if config.gated_attn:
+            self.gating_block = PatchTSMixerGatedAttention(in_size=config.d_model, out_size=config.d_model)
+
+    def forward(self, hidden: torch.Tensor):
+        """
+        Args:
+            hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
+                Input tensor to the layer.
+
+        Returns:
+            `torch.Tensor`: Transformed tensor.
+        """
+        residual = hidden
+        hidden = self.norm(hidden)
+        hidden = self.mlp(hidden)
+
+        if self.gated_attn:
+            hidden = self.gating_block(hidden)
+
+        out = hidden + residual
+        return out
+
+
+class PatchTSMixerLayer(nn.Module):
+    """
+    The `PatchTSMixer` layer that does all three kinds of mixing.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.patch_mixer = PatchMixerBlock(config=config)
+        self.feature_mixer = FeatureMixerBlock(config=config)
+
+        self.mode = config.mode
+
+        if config.mode == "mix_channel":
+            self.channel_feature_mixer = PatchTSMixerChannelFeatureMixerBlock(config=config)
+
+    def forward(self, hidden: torch.Tensor):
+        """
+        Args:
+            hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
+                Input tensor to the layer.
+
+        Returns:
+            `torch.Tensor`: Transformed tensor.
+        """
+        if self.mode == "mix_channel":
+            hidden = self.channel_feature_mixer(hidden)
+
+        hidden = self.patch_mixer(hidden)
+        hidden = self.feature_mixer(hidden)  # hidden: (batch_size x num_patches x d_model)
+        return hidden
+
+
+class PatchTSMixerBlock(nn.Module):
+    """The main computing framework of the `PatchTSMixer` model.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        num_layers = config.num_layers
+
+        self.mixers = nn.ModuleList([PatchTSMixerLayer(config=config) for _ in range(num_layers)])
+
+    def forward(self, hidden_state, output_hidden_states: bool = False):
+        """
+        Args:
+            hidden_state (`torch.Tensor`): The input tensor.
+            output_hidden_states (`bool`, *optional*, defaults to False.):
+                Whether to output the hidden states as well.
+
+        Returns:
+            `torch.Tensor`: The embedding. `list`: List of all hidden states if `output_hidden_states` is set to
+            `True`.
+        """
+        all_hidden_states = []
+
+        embedding = hidden_state
+
+        for mod in self.mixers:
+            embedding = mod(embedding)
+            if output_hidden_states:
+                all_hidden_states.append(embedding)
+
+        if output_hidden_states:
+            return embedding, all_hidden_states
+        else:
+            return embedding, None
+
+
+class PatchTSMixerForPredictionHead(nn.Module):
+    """Prediction Head for Forecasting
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*): Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
+        super().__init__()
+
+        self.prediction_channel_indices = config.prediction_channel_indices
+
+        if self.prediction_channel_indices is not None:
+            self.prediction_channel_indices.sort()
+
+        self.dropout_layer = nn.Dropout(config.head_dropout)
+        if distribution_output is None:
+            self.base_forecast_block = nn.Linear((config.num_patches * config.d_model), config.prediction_length)
+        else:
+            self.base_forecast_block = distribution_output.get_parameter_projection(
+                config.num_patches * config.d_model
+            )
+
+        self.flatten = nn.Flatten(start_dim=-2)
+
+    def forward(self, hidden_features):
+        """
+
+        Args:
+            hidden_features (`torch.Tensor` of shape `(batch_size, num_patch, d_model)` in `flatten` mode
+                or `(batch_size, n_vars, num_patch, d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
+                features.
+
+        Returns:
+            `torch.Tensor` of shape `(batch_size, prediction_length, nvars)`.
+
+        """
+
+        hidden_features = self.flatten(hidden_features)  # [batch_size x n_vars x num_patch * d_model]
+        hidden_features = self.dropout_layer(hidden_features)  # [batch_size x n_vars x num_patch * d_model]
+        forecast = self.base_forecast_block(hidden_features)  # [batch_size x n_vars x prediction_length]
+        if isinstance(forecast, tuple):
+            forecast = tuple(z.transpose(-1, -2) for z in forecast)
+        else:
+            forecast = forecast.transpose(-1, -2)  # [batch_size x prediction_length x n_vars]
+
+        if self.prediction_channel_indices is not None:
+            if isinstance(forecast, tuple):
+                forecast = tuple(z[..., self.prediction_channel_indices] for z in forecast)
+            else:
+                forecast = forecast[..., self.prediction_channel_indices]  # [batch_size x prediction_length x n_vars]
+
+        return forecast
+
+
+class PatchTSMixerLinearHead(nn.Module):
+    """Linear head for Classification and Regression.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+
+    """
+
+    def __init__(self, config: PatchTSMixerConfig, distribution_output=None):
+        super().__init__()
+
+        self.head_aggregation = config.head_aggregation
+        self.output_range = config.output_range
+
+        if config.head_aggregation is None:
+            mul_factor = config.num_patches
+        else:
+            mul_factor = 1
+        self.distribution_output = distribution_output
+        if distribution_output is None:
+            self.projection = nn.Linear(
+                config.d_model * config.num_input_channels * mul_factor,
+                config.num_targets,
+            )
+        else:
+            self.projection = distribution_output.get_parameter_projection(
+                config.d_model * config.num_input_channels * mul_factor
+            )
+
+        if config.head_aggregation is None:
+            self.flatten = nn.Flatten(start_dim=-3)
+        else:
+            self.flatten = nn.Flatten(start_dim=-2)
+
+        self.dropout = nn.Dropout(config.head_dropout)
+
+    def forward(self, hidden_features):
+        """
+        Args:
+            hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
+                or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
+                features.
+
+        Returns:
+            `torch.Tensor` of shape `(batch_size x num_targets)`.
+        """
+
+        # batch_size x d_model x num_patch or batch_size x n_vars x d_model x num_patch
+        hidden_features = hidden_features.transpose(-1, -2)
+        if self.head_aggregation == "use_last":
+            # batch_size x d_model (flatten) or # batch_size x n_vars x d_model (common_channel)
+            hidden_features = hidden_features[..., -1]
+        elif self.head_aggregation == "max_pool":
+            # batch_size x n_vars x d_model or batch_size x d_model
+            hidden_features = hidden_features.max(dim=-1).values
+        elif self.head_aggregation == "avg_pool":
+            # batch_size x n_vars x d_model or batch_size x d_model
+            hidden_features = hidden_features.mean(dim=-1)
+
+        if self.flatten:
+            hidden_features = self.flatten(hidden_features)
+        hidden_features = self.dropout(hidden_features)
+        hidden_features = self.projection(hidden_features)  # batch_size x num_targets
+
+        if (self.distribution_output is None) and (self.output_range is not None):
+            hidden_features = (
+                torch.sigmoid(hidden_features) * (self.output_range[1] - self.output_range[0]) + self.output_range[0]
+            )
+        return hidden_features
+
+
+class PatchTSMixerPreTrainedModel(PreTrainedModel):
+    # Weight initialization
+    config_class = PatchTSMixerConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = False
+
+    def _init_weights(self, module):
+        """Initialize weights"""
+        if isinstance(module, PatchTSMixerPositionalEncoding):
+            # initialize positional encoding
+            if self.config.positional_encoding_type == "random":
+                nn.init.normal_(module.position_enc, mean=0.0, std=0.1)
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, PatchTSMixerBatchNorm):
+            module.batchnorm.bias.data.zero_()
+            module.batchnorm.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+class PatchTSMixerPretrainHead(nn.Module):
+    """Pretraining head.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.dropout_layer = nn.Dropout(config.head_dropout)
+        self.base_pt_block = nn.Linear(config.d_model, config.patch_length)
+
+    def forward(self, hidden_features):
+        """
+        Args:
+            hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
+                or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
+                features.
+
+        Returns:
+            `torch.Tensor` of shape `(batch_size x n_vars x num_patch x patch_length)`.
+        """
+
+        hidden_features = self.dropout_layer(hidden_features)
+        forecast = self.base_pt_block(hidden_features)  # [batch_size x n_vars x num_patch x patch_length]
+        return forecast
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.random_masking
+def random_masking(
+    inputs: torch.Tensor,
+    mask_ratio: float,
+    unmasked_channel_indices: list = None,
+    channel_consistent_masking: bool = False,
+    mask_value: int = 0,
+):
+    """random_masking: Mask the input considering the control variables.
+
+    Args:
+        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
+            The input tensor to mask.
+        mask_ratio (`float`):
+            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
+        unmasked_channel_indices (list, *optional*):
+            Indices of channels that will not be masked.
+        channel_consistent_masking (bool, *optional*, defaults to `False`):
+            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
+            across channels.
+        mask_value (int, *optional*, defaults to 0):
+            Define the value of masked patches for pretraining.
+
+    Returns:
+        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
+        n]
+    """
+    if mask_ratio < 0 or mask_ratio >= 1:
+        raise ValueError(f"Mask ratio {mask_ratio} has to be between 0 and 1.")
+
+    batch_size, num_channels, sequence_length, num_features = inputs.shape
+    device = inputs.device
+
+    len_keep = int(sequence_length * (1 - mask_ratio))
+
+    if channel_consistent_masking:
+        noise = torch.rand(batch_size, 1, sequence_length, device=device)  # noise in [0, 1], bs x 1 x  L
+        noise = noise.repeat(1, num_channels, 1)  # bs x num_channels x time
+    else:
+        # noise in [0, 1], bs x num_channels x L
+        noise = torch.rand(batch_size, num_channels, sequence_length, device=device)
+
+    # mask: [bs x num_channels x num_patch]
+    mask = torch.ones(batch_size, num_channels, sequence_length, device=device)
+    mask[:, :, :len_keep] = 0
+
+    # sort noise for each sample
+    ids_shuffle = torch.argsort(noise, dim=-1)  # ascend: small is keep, large is remove
+    ids_restore = torch.argsort(ids_shuffle, dim=-1)  # ids_restore: [bs x num_channels x L]
+
+    mask = torch.gather(mask, dim=-1, index=ids_restore)
+    mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features)  # mask: [bs x num_channels x num_patches x patch_length]
+    if unmasked_channel_indices is not None:
+        mask[:, unmasked_channel_indices, :, :] = 0
+
+    inputs_mask = inputs.masked_fill(mask.bool(), mask_value)
+    return inputs_mask, mask[..., 0]
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.forecast_masking
+def forecast_masking(
+    inputs: torch.Tensor,
+    num_forecast_mask_patches: Union[list, int],
+    unmasked_channel_indices: list = None,
+    mask_value: int = 0,
+):
+    """Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
+    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.
+
+    Parameters:
+        inputs (`torch.Tensor`):
+            Input of shape `(bs, num_channels, num_patch, patch_len)`
+        num_forecast_mask_patches (`list`):
+            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
+        unmasked_channel_indices (`list`, *optional*):
+            Indices of channels that are not masked.
+        mask_value (`int`, *optional*, defaults to 0):
+            Values in the masked patches will be filled by `mask_value`.
+
+    Returns:
+        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
+        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
+    """
+
+    if isinstance(num_forecast_mask_patches, int):
+        num_forecast_mask_patches = [num_forecast_mask_patches]
+    forecast_mask_ratios = [1 for _ in num_forecast_mask_patches]
+
+    batch_size, num_channels, sequence_length, num_features = inputs.shape
+    mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device)
+
+    t_list = []
+    total_length = 0
+    total_ratio = sum(forecast_mask_ratios)
+
+    for patch_length, ratio in zip(num_forecast_mask_patches, forecast_mask_ratios):
+        if patch_length <= 0 or patch_length >= sequence_length:
+            raise ValueError(
+                f"num_forecast_mask_patches {patch_length} should be greater than 0 and less than total patches."
+            )
+        temp_len = int(batch_size * ratio / total_ratio)
+        t_list.append([patch_length, ratio, temp_len])
+        total_length += temp_len
+
+    t_list = sorted(t_list, key=lambda x: x[2])
+
+    if total_length < batch_size:
+        t_list[0][2] = t_list[0][2] + (batch_size - total_length)
+    elif total_length > batch_size:
+        t_list[-1][2] = t_list[-1][2] + (total_length - batch_size)
+
+    batch1 = 0
+    for patch_len, _, temp_len in t_list:
+        batch2 = batch1 + temp_len
+        mask[batch1:batch2, :, -patch_len:] = 1
+        batch1 = batch2
+
+    perm = torch.randperm(mask.shape[0])
+    mask = mask[perm]
+
+    mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features)  # mask: [bs x num_channels x num_patch x patch_len]
+    if unmasked_channel_indices is not None:
+        mask[:, unmasked_channel_indices, :, :] = 0
+
+    inputs_mask = inputs.masked_fill(mask.bool(), mask_value)
+    return inputs_mask, mask[..., 0]
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.PatchTSTPatchify with PatchTST->PatchTSMixer
+class PatchTSMixerPatchify(nn.Module):
+    """
+    A class to patchify the time series sequence into different patches
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+
+        self.sequence_length = config.context_length
+        self.patch_length = config.patch_length
+        self.patch_stride = config.patch_stride
+
+        if self.sequence_length <= self.patch_length:
+            raise ValueError(
+                f"Sequence length ({self.sequence_length}) has to be greater than the patch length ({self.patch_length})"
+            )
+
+        # get the number of patches
+        self.num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.patch_stride + 1
+        new_sequence_length = self.patch_length + self.patch_stride * (self.num_patches - 1)
+        self.sequence_start = self.sequence_length - new_sequence_length
+
+    def forward(self, past_values: torch.Tensor):
+        """
+        Parameters:
+            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
+                Input for patchification
+
+        Returns:
+            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
+        """
+        sequence_length = past_values.shape[-2]
+        if sequence_length != self.sequence_length:
+            raise ValueError(
+                f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})."
+            )
+        # output: [bs x new_sequence_length x num_channels]
+        output = past_values[:, self.sequence_start :, :]
+        # output: [bs x num_patches x num_input_channels x patch_length]
+        output = output.unfold(dimension=-2, size=self.patch_length, step=self.patch_stride)
+        # output: [bs x num_input_channels x num_patches x patch_length]
+        output = output.transpose(-2, -3).contiguous()
+        return output
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.PatchTSTMasking with PatchTST->PatchTSMixer
+class PatchTSMixerMasking(nn.Module):
+    """
+    Class to perform random or forecast masking.
+
+    Parameters:
+        config (`PatchTSMixerConfig`): model config
+    Returns:
+        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
+            Masked patched input
+        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
+            Bool tensor indicating True on masked points
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+        self.random_mask_ratio = config.random_mask_ratio
+        self.channel_consistent_masking = config.channel_consistent_masking
+        self.mask_type = config.mask_type
+        self.num_forecast_mask_patches = config.num_forecast_mask_patches
+        self.unmasked_channel_indices = config.unmasked_channel_indices
+        self.mask_value = config.mask_value
+        if self.unmasked_channel_indices is not None:
+            self.unmasked_channel_indices = sorted(self.unmasked_channel_indices)
+
+    def forward(self, patch_input: torch.Tensor):
+        """
+        Parameters:
+            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
+                Patch input
+
+        Return:
+            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
+                Masked patched input
+            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
+                Bool tensor indicating True on masked points
+
+        """
+        if self.mask_type == "random":
+            masked_input, mask = random_masking(
+                inputs=patch_input,
+                mask_ratio=self.random_mask_ratio,
+                unmasked_channel_indices=self.unmasked_channel_indices,
+                channel_consistent_masking=self.channel_consistent_masking,
+                mask_value=self.mask_value,
+            )
+        elif self.mask_type == "forecast":
+            masked_input, mask = forecast_masking(
+                inputs=patch_input,
+                num_forecast_mask_patches=self.num_forecast_mask_patches,
+                unmasked_channel_indices=self.unmasked_channel_indices,
+                mask_value=self.mask_value,
+            )
+        else:
+            raise ValueError(f"Invalid mask type {self.mask_type}.")
+
+        # mask: [bs x num_input_channels x num_patch]
+        mask = mask.bool()
+        return masked_input, mask
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.PatchTSTStdScaler with PatchTST->PatchTSMixer
+class PatchTSMixerStdScaler(nn.Module):
+    """
+    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
+    subtracting from the mean and dividing by the standard deviation.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
+        denominator = denominator.clamp_min(1.0)
+        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
+
+        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        return (data - loc) / scale, loc, scale
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.PatchTSTMeanScaler with PatchTST->PatchTSMixer
+class PatchTSMixerMeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
+    accordingly.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
+        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale
+        # of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
+
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+
+        return scaled_data, torch.zeros_like(scale), scale
+
+
+# Copied from transformers.models.patchtst.modeling_patchtst.PatchTSTNOPScaler with PatchTST->PatchTSMixer
+class PatchTSMixerNOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale
+
+
+@dataclass
+class PatchTSMixerEncoderOutput(ModelOutput):
+    """
+    Base class for `PatchTSMixerEncoderOutput`, with potential hidden states.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, d_model)`):
+            Hidden-state at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Hidden-states of the model at the output of each layer.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class PatchTSMixerEncoder(PatchTSMixerPreTrainedModel):
+    """
+    Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__(config)
+
+        self.use_return_dict = config.use_return_dict
+
+        self.patcher = nn.Linear(config.patch_length, config.d_model)
+        if config.use_positional_encoding:
+            self.positional_encoder = PatchTSMixerPositionalEncoding(config=config)
+        else:
+            self.positional_encoder = None
+        self.mlp_mixer_encoder = PatchTSMixerBlock(config=config)
+
+        # Initialize weights and apply final processing
+        if config.post_init:
+            self.post_init()
+
+    @replace_return_docstrings(output_type=PatchTSMixerEncoderOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, PatchTSMixerEncoderOutput]:
+        r"""
+        Args:
+            past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
+                Context values of the time series. For a pretraining task, this denotes the input time series to
+                predict the masked portion. For a forecasting task, this denotes the history/past time series values.
+                Similarly, for classification or regression tasks, it denotes the appropriate context values of the
+                time series.
+
+                For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series,
+                it is greater than 1.
+
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers.
+
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+            `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)`
+        """
+
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        # flatten [bs x num_patch x d_model]. common_channel/mix_channel: [bs x n_vars x num_patch x d_model]
+        patches = self.patcher(past_values)
+
+        # add positional encoder
+        if self.positional_encoder is not None:
+            patches = self.positional_encoder(patches)
+
+        last_hidden_state, hidden_states = self.mlp_mixer_encoder(patches, output_hidden_states=output_hidden_states)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    last_hidden_state,
+                    hidden_states,
+                ]
+            )
+
+        return PatchTSMixerEncoderOutput(last_hidden_state=last_hidden_state, hidden_states=hidden_states)
+
+
+@dataclass
+class PatchTSMixerModelOutput(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor`  of shape `(batch_size, num_channels, num_patches, d_model)`):
+            Hidden-state at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Hidden-states of the model at the output of each layer.
+        patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
+            Patched input data to the model.
+        mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*):
+            Bool Tensor indicating True in masked patches and False otherwise.
+        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*):
+            Gives the mean of the context window per channel. Used for revin denorm outside the model, if revin
+            enabled.
+        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*):
+            Gives the std dev of the context window per channel. Used for revin denorm outside the model, if revin
+            enabled.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    patch_input: torch.FloatTensor = None
+    mask: Optional[torch.FloatTensor] = None
+    loc: Optional[torch.FloatTensor] = None
+    scale: Optional[torch.FloatTensor] = None
+
+
+@add_start_docstrings(
+    "The PatchTSMixer Model for time-series forecasting.",
+    PATCHTSMIXER_START_DOCSTRING,
+)
+class PatchTSMixerModel(PatchTSMixerPreTrainedModel):
+    def __init__(self, config: PatchTSMixerConfig, mask_input: bool = False):
+        super().__init__(config)
+
+        self.use_return_dict = config.use_return_dict
+        self.encoder = PatchTSMixerEncoder(config)
+        self.patching = PatchTSMixerPatchify(config)
+
+        if mask_input is True:
+            self.masking = PatchTSMixerMasking(config)
+        else:
+            self.masking = None
+
+        if config.scaling == "mean":
+            self.scaler = PatchTSMixerMeanScaler(config)
+        elif config.scaling == "std" or config.scaling is True:
+            self.scaler = PatchTSMixerStdScaler(config)
+        else:
+            self.scaler = PatchTSMixerNOPScaler(config)
+
+        # Initialize weights and apply final processing
+        if config.post_init:
+            self.post_init()
+
+    @add_start_docstrings_to_model_forward(PATCHTSMIXER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=PatchTSMixerModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        observed_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ) -> PatchTSMixerModelOutput:
+        r"""
+        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        Returns:
+
+        """
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        mask = None
+        if observed_mask is None:
+            observed_mask = torch.ones_like(past_values)
+        scaled_past_values, loc, scale = self.scaler(past_values, observed_mask)
+
+        patched_x = self.patching(scaled_past_values)  # [batch_size x num_input_channels x num_patch x patch_length
+
+        enc_input = patched_x
+        if self.masking is not None:
+            enc_input, mask = self.masking(patched_x)
+            # enc_input: [batch_size x num_input_channels x num_patch x patch_length]
+            # mask: [batch_size x num_input_channels x num_patch]
+
+        encoder_output = self.encoder(
+            enc_input,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if isinstance(encoder_output, tuple):
+            encoder_output = PatchTSMixerEncoderOutput(*encoder_output)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    encoder_output.last_hidden_state,
+                    encoder_output.hidden_states,
+                    patched_x,
+                    mask,
+                    loc,
+                    scale,
+                ]
+            )
+
+        return PatchTSMixerModelOutput(
+            last_hidden_state=encoder_output.last_hidden_state,
+            hidden_states=encoder_output.hidden_states,
+            patch_input=patched_x,
+            mask=mask,
+            loc=loc,
+            scale=scale,
+        )
+
+
+@dataclass
+class PatchTSMixerForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`PatchTSMixerForPreTrainingOutput`].
+
+    Args:
+        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, patch_length)`):
+            Prediction output from the pretrain head.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Hidden-states of the model at the output of each layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
+            Backbone embeddings before passing through the head.
+        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
+            Total loss
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_outputs: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class PatchTSMixerForPretraining(PatchTSMixerPreTrainedModel):
+    r"""
+    `PatchTSMixer` for mask pretraining.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+
+    Returns:
+        `None`.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__(config)
+        self.model = PatchTSMixerModel(config, mask_input=True)
+        self.head = PatchTSMixerPretrainHead(config=config)
+        self.masked_loss = config.masked_loss
+        self.use_return_dict = config.use_return_dict
+
+        # Initialize weights and apply final processing
+        if config.post_init:
+            self.post_init()
+
+    @add_start_docstrings_to_model_forward(PATCHTSMIXER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=PatchTSMixerForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        observed_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = False,
+        return_loss: bool = True,
+        return_dict: Optional[bool] = None,
+    ) -> PatchTSMixerForPreTrainingOutput:
+        r"""
+        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+        return_loss (`bool`,  *optional*):
+            Whether to return the loss in the `forward` call.
+
+        Returns:
+
+        """
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        if self.masked_loss is True:
+            loss = torch.nn.MSELoss(reduction="none")
+        else:
+            loss = torch.nn.MSELoss(reduction="mean")
+
+        # past_values: tensor [batch_size x context_length x num_input_channels]
+        model_output = self.model(
+            past_values,
+            observed_mask=observed_mask,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )  # x.last_hidden_state: [batch_size x nvars x num_patch x d_model]
+        if isinstance(model_output, tuple):
+            model_output = PatchTSMixerModelOutput(*model_output)
+
+        x_hat = self.head(model_output.last_hidden_state)  # tensor [batch_size x nvars x num_patch x patch_length]
+
+        if return_loss is True:
+            loss_val = loss(x_hat, model_output.patch_input)
+        else:
+            loss_val = None
+
+        # calculate masked_loss
+        if self.masked_loss is True and loss_val is not None:
+            loss_val = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    loss_val,
+                    x_hat,
+                    model_output.last_hidden_state,
+                    model_output.hidden_states,
+                ]
+            )
+
+        return PatchTSMixerForPreTrainingOutput(
+            loss=loss_val,
+            prediction_outputs=x_hat,  # tensor [batch_size x nvars x num_patch x patch_length]
+            last_hidden_state=model_output.last_hidden_state,  # x: [batch_size x nvars x num_patch x d_model]
+            hidden_states=model_output.hidden_states,
+        )
+
+
+@dataclass
+class PatchTSMixerForPredictionOutput(ModelOutput):
+    """
+    Output type of [`PatchTSMixerForPredictionOutput`].
+
+    Args:
+        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_input_channels)`):
+            Prediction output from the forecast head.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
+            Backbone embeddings before passing through the head.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
+            Total loss.
+        loc (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
+            Input mean
+        scale (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
+            Input std dev
+
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_outputs: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    loc: torch.FloatTensor = None
+    scale: torch.FloatTensor = None
+
+
+@dataclass
+class SamplePatchTSMixerPredictionOutput(ModelOutput):
+    """
+    Base class for time series model's predictions outputs that contains the sampled values from the chosen
+    distribution.
+
+    Args:
+        sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, number_channels)`):
+            Sampled values from the chosen distribution.
+    """
+
+    sequences: torch.FloatTensor = None
+
+
+@dataclass
+class SamplePatchTSMixerRegressionOutput(ModelOutput):
+    """
+    Base class for time series model's predictions outputs that contains the sampled values from the chosen
+    distribution.
+
+    Args:
+        sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_targets)`
+                Sampled values from the chosen distribution.
+    """
+
+    sequences: torch.FloatTensor = None
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll
+def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+    return -input.log_prob(target)
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
+class PatchTSMixerForPrediction(PatchTSMixerPreTrainedModel):
+    r"""
+    `PatchTSMixer` for forecasting application.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+
+    Returns:
+        `None`.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__(config)
+        self.loss = config.loss
+        self.use_return_dict = config.use_return_dict
+        self.prediction_channel_indices = config.prediction_channel_indices
+        self.num_parallel_samples = config.num_parallel_samples
+
+        if config.loss == "mse":
+            self.distribution_output = None
+        else:
+            dim = config.prediction_length
+            distribution_output_map = {
+                "student_t": StudentTOutput,
+                "normal": NormalOutput,
+                "negative_binomial": NegativeBinomialOutput,
+            }
+            output_class = distribution_output_map.get(config.distribution_output, None)
+            if output_class is not None:
+                self.distribution_output = output_class(dim=dim)
+            else:
+                raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        self.model = PatchTSMixerModel(config)
+        self.head = PatchTSMixerForPredictionHead(
+            config=config,
+            distribution_output=self.distribution_output,
+        )
+
+        # Initialize weights and apply final processing
+        if config.post_init:
+            self.post_init()
+
+    @add_start_docstrings_to_model_forward(PATCHTSMIXER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=PatchTSMixerForPredictionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = False,
+        return_loss: bool = True,
+        return_dict: Optional[bool] = None,
+    ) -> PatchTSMixerForPredictionOutput:
+        r"""
+        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+            in `[0, 1]`:
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+        future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,:
+            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*): Target
+            values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
+            required for a pretraining task.
+
+            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
+            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
+            pass the target data with all channels, as channel Filtering for both prediction and target will be
+            manually applied before the loss computation.
+        return_loss (`bool`,  *optional*):
+            Whether to return the loss in the `forward` call.
+
+        Returns:
+
+        """
+        if self.loss == "mse":
+            loss = nn.MSELoss(reduction="mean")
+        elif self.loss == "nll":
+            loss = nll
+        else:
+            raise ValueError("Invalid loss function: Allowed values: mse and nll")
+
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        # past_values: tensor [batch_size x context_length x num_input_channels]
+        model_output = self.model(
+            past_values,
+            observed_mask=observed_mask,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )  # model_output: [batch_size x nvars x num_patch x d_model]
+        if isinstance(model_output, tuple):
+            model_output = PatchTSMixerModelOutput(*model_output)
+
+        # tensor [batch_size x prediction_length x num_input_channels]
+        y_hat = self.head(model_output.last_hidden_state)
+
+        loss_val = None
+        if self.prediction_channel_indices is not None:
+            if self.distribution_output:
+                distribution = self.distribution_output.distribution(
+                    y_hat,
+                    loc=model_output.loc[..., self.prediction_channel_indices],
+                    scale=model_output.scale[..., self.prediction_channel_indices],
+                )
+                if future_values is not None and return_loss is True:
+                    loss_val = loss(
+                        distribution,
+                        future_values[..., self.prediction_channel_indices],
+                    )
+                    # take average of the loss
+                    loss_val = weighted_average(loss_val)
+            else:
+                y_hat = (
+                    y_hat * model_output.scale[..., self.prediction_channel_indices]
+                    + model_output.loc[..., self.prediction_channel_indices]
+                )
+                if future_values is not None and return_loss is True:
+                    loss_val = loss(y_hat, future_values[..., self.prediction_channel_indices])
+        else:
+            if self.distribution_output:
+                distribution = self.distribution_output.distribution(
+                    y_hat, loc=model_output.loc, scale=model_output.scale
+                )
+                if future_values is not None and return_loss is True:
+                    loss_val = loss(distribution, future_values)
+                    loss_val = weighted_average(loss_val)
+            else:
+                y_hat = y_hat * model_output.scale + model_output.loc
+                if future_values is not None and return_loss is True:
+                    loss_val = loss(y_hat, future_values)
+
+        if self.prediction_channel_indices is not None:
+            loc = model_output.loc[..., self.prediction_channel_indices]
+            scale = model_output.scale[..., self.prediction_channel_indices]
+        else:
+            loc = model_output.loc
+            scale = model_output.scale
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    loss_val,
+                    y_hat,
+                    model_output.last_hidden_state,
+                    model_output.hidden_states,
+                    loc,
+                    scale,
+                ]
+            )
+
+        return PatchTSMixerForPredictionOutput(
+            loss=loss_val,
+            prediction_outputs=y_hat,  # tensor [batch_size x prediction_length x num_input_channels]
+            last_hidden_state=model_output.last_hidden_state,  # x: [batch_size x nvars x num_patch x d_model]
+            hidden_states=model_output.hidden_states,
+            loc=loc,
+            scale=scale,
+        )
+
+    def generate(
+        self,
+        past_values: torch.Tensor,
+        observed_mask: Optional[torch.Tensor] = None,
+    ) -> SamplePatchTSMixerPredictionOutput:
+        """
+        Generate sequences of sample predictions from a model with a probability distribution head.
+
+        Args:
+            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Past values of the time series that serves as context in order to predict the future.
+
+            observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        Return:
+            [`SamplePatchTSMixerPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
+            number of samples, prediction_length, num_input_channels)`.
+        """
+        # get number of samples
+        num_parallel_samples = self.num_parallel_samples
+
+        # get model output
+        outputs = self(
+            past_values=past_values,
+            future_values=None,
+            observed_mask=observed_mask,
+            output_hidden_states=False,
+        )
+
+        # get distribution
+
+        distribution = self.distribution_output.distribution(
+            outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale
+        )
+
+        # get samples: list of [batch_size x prediction_length x num_channels]
+        samples = [distribution.sample() for _ in range(num_parallel_samples)]
+
+        # stack tensors
+        samples = torch.stack(samples, dim=1)  # [batch_size x num_samples x prediction_length x num_channels]
+        return SamplePatchTSMixerPredictionOutput(sequences=samples)
+
+
+@dataclass
+class PatchTSMixerForTimeSeriesClassificationOutput(ModelOutput):
+    """
+    Output type of [`PatchTSMixerForTimeSeriesClassificationOutput`].
+
+    Args:
+        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
+            Prediction output from the classfication head.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
+            Backbone embeddings before passing through the head.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
+            Total loss.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_outputs: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class PatchTSMixerForTimeSeriesClassification(PatchTSMixerPreTrainedModel):
+    r"""
+    `PatchTSMixer` for classification application.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+
+    Returns:
+        `None`.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__(config)
+
+        self.model = PatchTSMixerModel(config)
+        self.head = PatchTSMixerLinearHead(
+            config=config,
+        )
+        self.use_return_dict = config.use_return_dict
+        if config.scaling in ["std", "mean", True]:
+            self.inject_scale = InjectScalerStatistics4D(d_model=config.d_model, num_patches=config.num_patches)
+        else:
+            self.inject_scale = None
+
+        # Initialize weights and apply final processing
+        if config.post_init:
+            self.post_init()
+
+    @add_start_docstrings_to_model_forward(PATCHTSMIXER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(
+        output_type=PatchTSMixerForTimeSeriesClassificationOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        future_values: torch.Tensor = None,
+        output_hidden_states: Optional[bool] = False,
+        return_loss: bool = True,
+        return_dict: Optional[bool] = None,
+    ) -> PatchTSMixerForTimeSeriesClassificationOutput:
+        r"""
+        future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
+            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*): Target
+            values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
+            required for a pretraining task.
+
+            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
+            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
+            pass the target data with all channels, as channel Filtering for both prediction and target will be
+            manually applied before the loss computation.
+
+            For a classification task, it has a shape of `(batch_size,)`.
+
+            For a regression task, it has a shape of `(batch_size, num_targets)`.
+        return_loss (`bool`, *optional*):
+            Whether to return the loss in the `forward` call.
+
+        Returns:
+
+        """
+
+        loss = torch.nn.CrossEntropyLoss()
+
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+
+        model_output = self.model(
+            past_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )  # x: [batch_size x nvars x num_patch x d_model]
+        if isinstance(model_output, tuple):
+            model_output = PatchTSMixerModelOutput(*model_output)
+
+        if self.inject_scale is not None:
+            model_output.last_hidden_state = self.inject_scale(
+                model_output.last_hidden_state,
+                loc=model_output.loc,
+                scale=model_output.scale,
+            )  # x: [batch_size x nvars x num_patch x d_model]
+
+        y_hat = self.head(model_output.last_hidden_state)  # tensor [batch_size x n_labels]
+
+        if future_values is not None and return_loss is True:
+            loss_val = loss(y_hat, future_values)
+        else:
+            loss_val = None
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    loss_val,
+                    y_hat,
+                    model_output.last_hidden_state,
+                    model_output.hidden_states,
+                ]
+            )
+
+        return PatchTSMixerForTimeSeriesClassificationOutput(
+            loss=loss_val,
+            prediction_outputs=y_hat,  # tensor [batch_size x n_labels]
+            last_hidden_state=model_output.last_hidden_state,  # x: [batch_size x nvars x num_patch x d_model]
+            hidden_states=model_output.hidden_states,
+        )
+
+
+@dataclass
+class PatchTSMixerForRegressionOutput(ModelOutput):
+    """
+    Output type of [`PatchTSMixerForRegressionOutput`].
+
+    Args:
+        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
+            Prediction output from the regression head.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
+            Backbone embeddings before passing through the head.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
+            Total loss.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_outputs: torch.FloatTensor = None
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class InjectScalerStatistics4D(nn.Module):
+    def __init__(self, d_model: int, num_patches: int, expansion: int = 2):
+        super().__init__()
+
+        self.inverse_trans_expansion = nn.Linear(d_model + 2, expansion * d_model)
+        self.inverse_trans_compression = nn.Linear(expansion * d_model, d_model)
+        self.map_scale_expansion = nn.Linear(2, 2 * expansion)
+        self.map_scale_compression = nn.Linear(2 * expansion, 2)
+        self.num_patches = num_patches
+
+    def forward(self, inputs: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor):
+        """
+        Args:
+            inputs (`torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`)
+            loc (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
+            scale (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
+        Returns:
+            `torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`
+        """
+
+        mean = loc.transpose(-1, -2)  # [batch_size x n_channels x 1 ]
+        mean = mean.unsqueeze(-2)  # [batch_size x n_channels x 1 x 1]
+        mean = mean.repeat(1, 1, self.num_patches, 1)  # [batch_size x n_channels x num_patch x 1]
+
+        stdev = scale.transpose(-1, -2)  # [batch_size x n_channels x 1 ]
+        stdev = stdev.unsqueeze(-2)  # [batch_size x n_channels x 1 x 1]
+        stdev = stdev.repeat(1, 1, self.num_patches, 1)  # [batch_size x n_channels x num_patch x 1]
+
+        concat_stats = torch.cat([mean, stdev], dim=-1)  # [batch_size x n_channels x num_patch x 2]
+
+        concat_stats = self.map_scale_expansion(concat_stats)  # [batch_size x n_channels x num_patch x (2*expansion)]
+        concat_stats = self.map_scale_compression(concat_stats)  # [batch_size x n_channels x num_patch x 2]
+
+        inputs = torch.cat([inputs, concat_stats], dim=-1)  # [batch_size x channels x num_patch x d_model+2]
+        inputs = self.inverse_trans_expansion(inputs)  # [batch_size x channels x num_patch x (expansion*d_model)]
+        inputs = self.inverse_trans_compression(inputs)  # [batch_size x channels x num_patch x d_model]
+
+        return inputs
+
+
+class PatchTSMixerForRegression(PatchTSMixerPreTrainedModel):
+    r"""
+    `PatchTSMixer` for regression application.
+
+    Args:
+        config (`PatchTSMixerConfig`, *required*):
+            Configuration.
+
+    Returns:
+        `None`.
+    """
+
+    def __init__(self, config: PatchTSMixerConfig):
+        super().__init__(config)
+
+        self.model = PatchTSMixerModel(config)
+
+        self.loss = config.loss
+        self.distribution_output = config.distribution_output
+
+        self.use_return_dict = config.use_return_dict
+        self.num_parallel_samples = config.num_parallel_samples
+
+        if config.loss == "mse":
+            self.distribution_output = None
+        else:
+            distribution_output_map = {
+                "student_t": StudentTOutput,
+                "normal": NormalOutput,
+                "negative_binomial": NegativeBinomialOutput,
+            }
+            output_class = distribution_output_map.get(config.distribution_output)
+            if output_class is not None:
+                self.distribution_output = output_class(dim=config.num_targets)
+            else:
+                raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        if config.scaling in ["std", "mean", True]:
+            self.inject_scale = InjectScalerStatistics4D(d_model=config.d_model, num_patches=config.num_patches)
+        else:
+            self.inject_scale = None
+
+        self.head = PatchTSMixerLinearHead(
+            config=config,
+            distribution_output=self.distribution_output,
+        )
+
+        # Initialize weights and apply final processing
+        if config.post_init:
+            self.post_init()
+
+    @add_start_docstrings_to_model_forward(PATCHTSMIXER_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=PatchTSMixerForRegressionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        future_values: torch.Tensor = None,
+        output_hidden_states: Optional[bool] = False,
+        return_loss: bool = True,
+        return_dict: Optional[bool] = None,
+    ) -> PatchTSMixerForRegressionOutput:
+        r"""
+        future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
+            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*): Target
+            values of the time series, that serve as labels for the model. The `future_values` is what the
+            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
+            required for a pretraining task.
+
+            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
+            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
+            pass the target data with all channels, as channel Filtering for both prediction and target will be
+            manually applied before the loss computation.
+
+            For a classification task, it has a shape of `(batch_size,)`.
+
+            For a regression task, it has a shape of `(batch_size, num_targets)`.
+        return_loss (`bool`, *optional*):
+            Whether to return the loss in the `forward` call.
+
+        Returns:
+
+        """
+
+        if self.loss == "mse":
+            loss = nn.MSELoss(reduction="mean")
+        elif self.loss == "nll":
+            loss = nll
+        else:
+            raise ValueError("Invalid loss function: Allowed values: mse and nll")
+
+        return_dict = return_dict if return_dict is not None else self.use_return_dict
+        model_output = self.model(
+            past_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )  # model_output: [batch_size x nvars x num_patch x d_model]
+        if isinstance(model_output, tuple):
+            model_output = PatchTSMixerModelOutput(*model_output)
+
+        if self.inject_scale is not None:
+            model_output.last_hidden_state = self.inject_scale(
+                model_output.last_hidden_state,
+                loc=model_output.loc,
+                scale=model_output.scale,
+            )  # x: [batch_size x nvars x num_patch x d_model]
+
+        y_hat = self.head(model_output.last_hidden_state)  # [batch_size x num_targets]
+
+        if future_values is not None and return_loss is True:
+            if self.distribution_output:
+                if self.distribution_output == "negative_binomial" and torch.any(future_values < 0):
+                    raise Exception("future_values cannot be negative for negative_binomial distribution.")
+                distribution = self.distribution_output.distribution(y_hat)
+                loss_val = loss(distribution, future_values)
+                # take average of the loss
+                loss_val = weighted_average(loss_val)
+            else:
+                loss_val = loss(y_hat, future_values)
+        else:
+            loss_val = None
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    loss_val,
+                    y_hat,
+                    model_output.last_hidden_state,
+                    model_output.hidden_states,
+                ]
+            )
+
+        return PatchTSMixerForRegressionOutput(
+            loss=loss_val,
+            prediction_outputs=y_hat,  # tensor [batch_size x num_targets]
+            last_hidden_state=model_output.last_hidden_state,  # [batch_size x nvars x num_patch x d_model]
+            hidden_states=model_output.hidden_states,
+        )
+
+    def generate(
+        self,
+        past_values: torch.Tensor,
+    ) -> SamplePatchTSMixerRegressionOutput:
+        """
+        Generate sequences of sample predictions from a model with a probability distribution head.
+
+        Args:
+            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Past values of the time series that serves as context in order to predict the future.
+
+        Return:
+            [`SamplePatchTSMixerRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
+            number of samples, num_targets)`.
+        """
+        # get number of samples
+        num_parallel_samples = self.num_parallel_samples
+
+        # get model output
+        outputs = self(
+            past_values=past_values,
+            future_values=None,
+            output_hidden_states=False,
+        )
+
+        # get distribution
+        distribution = self.distribution_output.distribution(outputs.prediction_outputs)
+
+        # get samples
+        samples = [
+            distribution.sample() for _ in range(num_parallel_samples)
+        ]  # samples: list of [batch_size x num_targets]
+        # stack tensors
+        samples = torch.stack(samples, dim=1)  # [batch_size x num_samples x num_targets]
+        return SamplePatchTSMixerRegressionOutput(sequences=samples)
diff --git a/src/transformers/models/patchtst/__init__.py b/src/transformers/models/patchtst/__init__.py
new file mode 100644
index 00000000000000..8c7db64c198406
--- /dev/null
+++ b/src/transformers/models/patchtst/__init__.py
@@ -0,0 +1,66 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+# rely on isort to merge the imports
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {
+    "configuration_patchtst": [
+        "PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "PatchTSTConfig",
+    ],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_patchtst"] = [
+        "PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "PatchTSTModel",
+        "PatchTSTPreTrainedModel",
+        "PatchTSTForPrediction",
+        "PatchTSTForPretraining",
+        "PatchTSTForRegression",
+        "PatchTSTForClassification",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_patchtst import PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP, PatchTSTConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_patchtst import (
+            PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST,
+            PatchTSTForClassification,
+            PatchTSTForPrediction,
+            PatchTSTForPretraining,
+            PatchTSTForRegression,
+            PatchTSTModel,
+            PatchTSTPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/patchtst/configuration_patchtst.py b/src/transformers/models/patchtst/configuration_patchtst.py
new file mode 100644
index 00000000000000..5cf949304e91fe
--- /dev/null
+++ b/src/transformers/models/patchtst/configuration_patchtst.py
@@ -0,0 +1,262 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PatchTST model configuration"""
+
+from typing import List, Optional, Union
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+PATCHTST_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "ibm/patchtst-base": "https://huggingface.co/ibm/patchtst-base/resolve/main/config.json",
+    # See all PatchTST models at https://huggingface.co/ibm/models?filter=patchtst
+}
+
+
+class PatchTSTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`PatchTSTModel`]. It is used to instantiate an
+    PatchTST model according to the specified arguments, defining the model architecture.
+    [ibm/patchtst](https://huggingface.co/ibm/patchtst) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_input_channels (`int`, *optional*, defaults to 1):
+            The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of
+            multivariate targets.
+        context_length (`int`, *optional*, defaults to 32):
+            The context length of the input sequence.
+        distribution_output (`str`, *optional*, defaults to `"student_t"`):
+            The distribution emission head for the model when loss is "nll". Could be either "student_t", "normal" or
+            "negative_binomial".
+        loss (`str`, *optional*, defaults to `"mse"`):
+            The loss function for the model corresponding to the `distribution_output` head. For parametric
+            distributions it is the negative log likelihood ("nll") and for point estimates it is the mean squared
+            error "mse".
+        patch_length (`int`, *optional*, defaults to 1):
+            Define the patch length of the patchification process.
+        patch_stride (`int`, *optional*, defaults to 1):
+            Define the stride of the patchification process.
+        num_hidden_layers (`int`, *optional*, defaults to 3):
+            Number of hidden layers.
+        d_model (`int`, *optional*, defaults to 128):
+            Dimensionality of the transformer layers.
+        num_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        share_embedding (`bool`, *optional*, defaults to `True`):
+            Sharing the input embedding across all channels.
+        channel_attention (`bool`, *optional*, defaults to `False`):
+            Activate channel attention block in the Transformer to allow channels to attend each other.
+        ffn_dim (`int`, *optional*, defaults to 512):
+            Dimension of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        norm_type (`str` , *optional*, defaults to `"batchnorm"`):
+            Normalization at each Transformer layer. Can be `"batchnorm"` or `"layernorm"`.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            A value added to the denominator for numerical stability of normalization.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the attention probabilities.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the Transformer.
+        positional_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability in the positional embedding layer.
+        path_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout path in the residual block.
+        ff_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability used between the two layers of the feed-forward networks.
+        bias (`bool`, *optional*, defaults to `True`):
+            Whether to add bias in the feed-forward networks.
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (string) in the Transformer.`"gelu"` and `"relu"` are supported.
+        pre_norm (`bool`, *optional*, defaults to `True`):
+            Normalization is applied before self-attention if pre_norm is set to `True`. Otherwise, normalization is
+            applied after residual block.
+        positional_encoding_type (`str`, *optional*, defaults to `"sincos"`):
+            Positional encodings. Options `"random"` and `"sincos"` are supported.
+        use_cls_token (`bool`, *optional*, defaults to `False`):
+            Whether cls token is used.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated normal weight initialization distribution.
+        share_projection (`bool`, *optional*, defaults to `True`):
+            Sharing the projection layer across different channels in the forecast head.
+        scaling (`Union`, *optional*, defaults to `"std"`):
+            Whether to scale the input targets via "mean" scaler, "std" scaler or no scaler if `None`. If `True`, the
+            scaler is set to "mean".
+        do_mask_input (`bool`, *optional*):
+            Apply masking during the pretraining.
+        mask_type (`str`, *optional*, defaults to `"random"`):
+            Masking type. Only `"random"` and `"forecast"` are currently supported.
+        random_mask_ratio (`float`, *optional*, defaults to 0.5):
+            Masking ratio applied to mask the input data during random pretraining.
+        num_forecast_mask_patches (`int` or `list`, *optional*, defaults to `[2]`):
+            Number of patches to be masked at the end of each batch sample. If it is an integer,
+            all the samples in the batch will have the same number of masked patches. If it is a list,
+            samples in the batch will be randomly masked by numbers defined in the list. This argument is only used
+            for forecast pretraining.
+        channel_consistent_masking (`bool`, *optional*, defaults to `False`):
+            If channel consistent masking is True, all the channels will have the same masking pattern.
+        unmasked_channel_indices (`list`, *optional*):
+            Indices of channels that are not masked during pretraining. Values in the list are number between 1 and
+            `num_input_channels`
+        mask_value (`int`, *optional*, defaults to 0):
+            Values in the masked patches will be filled by `mask_value`.
+        pooling_type (`str`, *optional*, defaults to `"mean"`):
+            Pooling of the embedding. `"mean"`, `"max"` and `None` are supported.
+        head_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for head.
+        prediction_length (`int`, *optional*, defaults to 24):
+            The prediction horizon that the model will output.
+        num_targets (`int`, *optional*, defaults to 1):
+            Number of targets for regression and classification tasks. For classification, it is the number of
+            classes.
+        output_range (`list`, *optional*):
+            Output range for regression task. The range of output values can be set to enforce the model to produce
+            values within a range.
+        num_parallel_samples (`int`, *optional*, defaults to 100):
+            The number of samples is generated in parallel for probabilistic prediction.
+
+
+    ```python
+    >>> from transformers import PatchTSTConfig, PatchTSTModel
+
+    >>> # Initializing an PatchTST configuration with 12 time steps for prediction
+    >>> configuration = PatchTSTConfig(prediction_length=12)
+
+    >>> # Randomly initializing a model (with random weights) from the configuration
+    >>> model = PatchTSTModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "patchtst"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "num_attention_heads",
+        "num_hidden_layers": "num_hidden_layers",
+    }
+
+    def __init__(
+        self,
+        # time series specific configuration
+        num_input_channels: int = 1,
+        context_length: int = 32,
+        distribution_output: str = "student_t",
+        loss: str = "mse",
+        # PatchTST arguments
+        patch_length: int = 1,
+        patch_stride: int = 1,
+        # Transformer architecture configuration
+        num_hidden_layers: int = 3,
+        d_model: int = 128,
+        num_attention_heads: int = 4,
+        share_embedding: bool = True,
+        channel_attention: bool = False,
+        ffn_dim: int = 512,
+        norm_type: str = "batchnorm",
+        norm_eps: float = 1e-05,
+        attention_dropout: float = 0.0,
+        dropout: float = 0.0,
+        positional_dropout: float = 0.0,
+        path_dropout: float = 0.0,
+        ff_dropout: float = 0.0,
+        bias: bool = True,
+        activation_function: str = "gelu",
+        pre_norm: bool = True,
+        positional_encoding_type: str = "sincos",
+        use_cls_token: bool = False,
+        init_std: float = 0.02,
+        share_projection: bool = True,
+        scaling: Optional[Union[str, bool]] = "std",
+        # mask pretraining
+        do_mask_input: Optional[bool] = None,
+        mask_type: str = "random",
+        random_mask_ratio: float = 0.5,
+        num_forecast_mask_patches: Optional[Union[List[int], int]] = [2],
+        channel_consistent_masking: Optional[bool] = False,
+        unmasked_channel_indices: Optional[List[int]] = None,
+        mask_value: int = 0,
+        # head
+        pooling_type: str = "mean",
+        head_dropout: float = 0.0,
+        prediction_length: int = 24,
+        num_targets: int = 1,
+        output_range: Optional[List] = None,
+        # distribution head
+        num_parallel_samples: int = 100,
+        **kwargs,
+    ):
+        # time series specific configuration
+        self.context_length = context_length
+        self.num_input_channels = num_input_channels  # n_vars
+        self.loss = loss
+        self.distribution_output = distribution_output
+        self.num_parallel_samples = num_parallel_samples
+
+        # Transformer architecture configuration
+        self.d_model = d_model
+        self.num_attention_heads = num_attention_heads
+        self.ffn_dim = ffn_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.share_embedding = share_embedding
+        self.channel_attention = channel_attention
+        self.norm_type = norm_type
+        self.norm_eps = norm_eps
+        self.positional_dropout = positional_dropout
+        self.path_dropout = path_dropout
+        self.ff_dropout = ff_dropout
+        self.bias = bias
+        self.activation_function = activation_function
+        self.pre_norm = pre_norm
+        self.positional_encoding_type = positional_encoding_type
+        self.use_cls_token = use_cls_token
+        self.init_std = init_std
+        self.scaling = scaling
+
+        # PatchTST parameters
+        self.patch_length = patch_length
+        self.patch_stride = patch_stride
+
+        # Mask pretraining
+        self.do_mask_input = do_mask_input
+        self.mask_type = mask_type
+        self.random_mask_ratio = random_mask_ratio  # for random masking
+        self.num_forecast_mask_patches = num_forecast_mask_patches  # for forecast masking
+        self.channel_consistent_masking = channel_consistent_masking
+        self.unmasked_channel_indices = unmasked_channel_indices
+        self.mask_value = mask_value
+
+        # general head params
+        self.pooling_type = pooling_type
+        self.head_dropout = head_dropout
+
+        # For prediction head
+        self.share_projection = share_projection
+        self.prediction_length = prediction_length
+
+        # For prediction and regression head
+        self.num_parallel_samples = num_parallel_samples
+
+        # Regression
+        self.num_targets = num_targets
+        self.output_range = output_range
+
+        super().__init__(**kwargs)
diff --git a/src/transformers/models/patchtst/modeling_patchtst.py b/src/transformers/models/patchtst/modeling_patchtst.py
new file mode 100755
index 00000000000000..3ace6d9546ac14
--- /dev/null
+++ b/src/transformers/models/patchtst/modeling_patchtst.py
@@ -0,0 +1,2034 @@
+# coding=utf-8
+# Copyright 2023 IBM & Hugging Face. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch PatchTST model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2CLS
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
+from ...utils import ModelOutput, add_start_docstrings, logging
+from .configuration_patchtst import PatchTSTConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "PatchTSTConfig"
+
+PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "ibm/patchtst-etth1-pretrain",
+    # See all PatchTST models at https://huggingface.co/models?filter=patchtst
+]
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->PatchTST
+class PatchTSTAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[PatchTSTConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_value
+
+
+class PatchTSTBatchNorm(nn.Module):
+    """
+    Compute batch normalization over the sequence length (time) dimension.
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.batchnorm = nn.BatchNorm1d(config.d_model, eps=config.norm_eps)
+
+    def forward(self, inputs: torch.Tensor):
+        """
+        Parameters:
+            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
+                input for Batch norm calculation
+        Returns:
+            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
+        """
+        output = inputs.transpose(1, 2)  # output: (batch_size, d_model, sequence_length)
+        output = self.batchnorm(output)
+        return output.transpose(1, 2)
+
+
+def random_masking(
+    inputs: torch.Tensor,
+    mask_ratio: float,
+    unmasked_channel_indices: list = None,
+    channel_consistent_masking: bool = False,
+    mask_value: int = 0,
+):
+    """random_masking: Mask the input considering the control variables.
+
+    Args:
+        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
+            The input tensor to mask.
+        mask_ratio (`float`):
+            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
+        unmasked_channel_indices (list, *optional*):
+            Indices of channels that will not be masked.
+        channel_consistent_masking (bool, *optional*, defaults to `False`):
+            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
+            across channels.
+        mask_value (int, *optional*, defaults to 0):
+            Define the value of masked patches for pretraining.
+
+    Returns:
+        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
+        n]
+    """
+    if mask_ratio < 0 or mask_ratio >= 1:
+        raise ValueError(f"Mask ratio {mask_ratio} has to be between 0 and 1.")
+
+    batch_size, num_channels, sequence_length, num_features = inputs.shape
+    device = inputs.device
+
+    len_keep = int(sequence_length * (1 - mask_ratio))
+
+    if channel_consistent_masking:
+        noise = torch.rand(batch_size, 1, sequence_length, device=device)  # noise in [0, 1], bs x 1 x  L
+        noise = noise.repeat(1, num_channels, 1)  # bs x num_channels x time
+    else:
+        # noise in [0, 1], bs x num_channels x L
+        noise = torch.rand(batch_size, num_channels, sequence_length, device=device)
+
+    # mask: [bs x num_channels x num_patch]
+    mask = torch.ones(batch_size, num_channels, sequence_length, device=device)
+    mask[:, :, :len_keep] = 0
+
+    # sort noise for each sample
+    ids_shuffle = torch.argsort(noise, dim=-1)  # ascend: small is keep, large is remove
+    ids_restore = torch.argsort(ids_shuffle, dim=-1)  # ids_restore: [bs x num_channels x L]
+
+    mask = torch.gather(mask, dim=-1, index=ids_restore)
+    mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features)  # mask: [bs x num_channels x num_patches x patch_length]
+    if unmasked_channel_indices is not None:
+        mask[:, unmasked_channel_indices, :, :] = 0
+
+    inputs_mask = inputs.masked_fill(mask.bool(), mask_value)
+    return inputs_mask, mask[..., 0]
+
+
+def forecast_masking(
+    inputs: torch.Tensor,
+    num_forecast_mask_patches: Union[list, int],
+    unmasked_channel_indices: list = None,
+    mask_value: int = 0,
+):
+    """Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
+    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.
+
+    Parameters:
+        inputs (`torch.Tensor`):
+            Input of shape `(bs, num_channels, num_patch, patch_len)`
+        num_forecast_mask_patches (`list`):
+            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
+        unmasked_channel_indices (`list`, *optional*):
+            Indices of channels that are not masked.
+        mask_value (`int`, *optional*, defaults to 0):
+            Values in the masked patches will be filled by `mask_value`.
+
+    Returns:
+        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
+        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
+    """
+
+    if isinstance(num_forecast_mask_patches, int):
+        num_forecast_mask_patches = [num_forecast_mask_patches]
+    forecast_mask_ratios = [1 for _ in num_forecast_mask_patches]
+
+    batch_size, num_channels, sequence_length, num_features = inputs.shape
+    mask = torch.zeros(batch_size, num_channels, sequence_length, device=inputs.device)
+
+    t_list = []
+    total_length = 0
+    total_ratio = sum(forecast_mask_ratios)
+
+    for patch_length, ratio in zip(num_forecast_mask_patches, forecast_mask_ratios):
+        if patch_length <= 0 or patch_length >= sequence_length:
+            raise ValueError(
+                f"num_forecast_mask_patches {patch_length} should be greater than 0 and less than total patches."
+            )
+        temp_len = int(batch_size * ratio / total_ratio)
+        t_list.append([patch_length, ratio, temp_len])
+        total_length += temp_len
+
+    t_list = sorted(t_list, key=lambda x: x[2])
+
+    if total_length < batch_size:
+        t_list[0][2] = t_list[0][2] + (batch_size - total_length)
+    elif total_length > batch_size:
+        t_list[-1][2] = t_list[-1][2] + (total_length - batch_size)
+
+    batch1 = 0
+    for patch_len, _, temp_len in t_list:
+        batch2 = batch1 + temp_len
+        mask[batch1:batch2, :, -patch_len:] = 1
+        batch1 = batch2
+
+    perm = torch.randperm(mask.shape[0])
+    mask = mask[perm]
+
+    mask = mask.unsqueeze(-1).repeat(1, 1, 1, num_features)  # mask: [bs x num_channels x num_patch x patch_len]
+    if unmasked_channel_indices is not None:
+        mask[:, unmasked_channel_indices, :, :] = 0
+
+    inputs_mask = inputs.masked_fill(mask.bool(), mask_value)
+    return inputs_mask, mask[..., 0]
+
+
+class PatchTSTPatchify(nn.Module):
+    """
+    A class to patchify the time series sequence into different patches
+
+    Returns:
+        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+
+        self.sequence_length = config.context_length
+        self.patch_length = config.patch_length
+        self.patch_stride = config.patch_stride
+
+        if self.sequence_length <= self.patch_length:
+            raise ValueError(
+                f"Sequence length ({self.sequence_length}) has to be greater than the patch length ({self.patch_length})"
+            )
+
+        # get the number of patches
+        self.num_patches = (max(self.sequence_length, self.patch_length) - self.patch_length) // self.patch_stride + 1
+        new_sequence_length = self.patch_length + self.patch_stride * (self.num_patches - 1)
+        self.sequence_start = self.sequence_length - new_sequence_length
+
+    def forward(self, past_values: torch.Tensor):
+        """
+        Parameters:
+            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
+                Input for patchification
+
+        Returns:
+            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
+        """
+        sequence_length = past_values.shape[-2]
+        if sequence_length != self.sequence_length:
+            raise ValueError(
+                f"Input sequence length ({sequence_length}) doesn't match model configuration ({self.sequence_length})."
+            )
+        # output: [bs x new_sequence_length x num_channels]
+        output = past_values[:, self.sequence_start :, :]
+        # output: [bs x num_patches x num_input_channels x patch_length]
+        output = output.unfold(dimension=-2, size=self.patch_length, step=self.patch_stride)
+        # output: [bs x num_input_channels x num_patches x patch_length]
+        output = output.transpose(-2, -3).contiguous()
+        return output
+
+
+class PatchTSTMasking(nn.Module):
+    """
+    Class to perform random or forecast masking.
+
+    Parameters:
+        config (`PatchTSTConfig`): model config
+    Returns:
+        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
+            Masked patched input
+        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
+            Bool tensor indicating True on masked points
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.random_mask_ratio = config.random_mask_ratio
+        self.channel_consistent_masking = config.channel_consistent_masking
+        self.mask_type = config.mask_type
+        self.num_forecast_mask_patches = config.num_forecast_mask_patches
+        self.unmasked_channel_indices = config.unmasked_channel_indices
+        self.mask_value = config.mask_value
+        if self.unmasked_channel_indices is not None:
+            self.unmasked_channel_indices = sorted(self.unmasked_channel_indices)
+
+    def forward(self, patch_input: torch.Tensor):
+        """
+        Parameters:
+            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
+                Patch input
+
+        Return:
+            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
+                Masked patched input
+            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
+                Bool tensor indicating True on masked points
+
+        """
+        if self.mask_type == "random":
+            masked_input, mask = random_masking(
+                inputs=patch_input,
+                mask_ratio=self.random_mask_ratio,
+                unmasked_channel_indices=self.unmasked_channel_indices,
+                channel_consistent_masking=self.channel_consistent_masking,
+                mask_value=self.mask_value,
+            )
+        elif self.mask_type == "forecast":
+            masked_input, mask = forecast_masking(
+                inputs=patch_input,
+                num_forecast_mask_patches=self.num_forecast_mask_patches,
+                unmasked_channel_indices=self.unmasked_channel_indices,
+                mask_value=self.mask_value,
+            )
+        else:
+            raise ValueError(f"Invalid mask type {self.mask_type}.")
+
+        # mask: [bs x num_input_channels x num_patch]
+        mask = mask.bool()
+        return masked_input, mask
+
+
+class PatchTSTEncoderLayer(nn.Module):
+    """
+    PatchTST encoder layer
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+
+        self.channel_attention = config.channel_attention
+        # Multi-Head attention
+        self.self_attn = PatchTSTAttention(
+            embed_dim=config.d_model,
+            num_heads=config.num_attention_heads,
+            dropout=config.attention_dropout,
+        )
+
+        # Add & Norm of the sublayer 1
+        self.dropout_path1 = nn.Dropout(config.path_dropout) if config.path_dropout > 0 else nn.Identity()
+        if config.norm_type == "batchnorm":
+            self.norm_sublayer1 = PatchTSTBatchNorm(config)
+        elif config.norm_type == "layernorm":
+            self.norm_sublayer1 = nn.LayerNorm(config.d_model, eps=config.norm_eps)
+        else:
+            raise ValueError(f"{config.norm_type} is not a supported norm layer type.")
+
+        # Add & Norm of the sublayer 2
+        if self.channel_attention:
+            self.dropout_path2 = nn.Dropout(config.path_dropout) if config.path_dropout > 0 else nn.Identity()
+            if config.norm_type == "batchnorm":
+                self.norm_sublayer2 = PatchTSTBatchNorm(config)
+            elif config.norm_type == "layernorm":
+                self.norm_sublayer2 = nn.LayerNorm(config.d_model, eps=config.norm_eps)
+            else:
+                raise ValueError(f"{config.norm_type} is not a supported norm layer type.")
+
+        # Position-wise Feed-Forward
+        self.ff = nn.Sequential(
+            nn.Linear(config.d_model, config.ffn_dim, bias=config.bias),
+            ACT2CLS[config.activation_function](),
+            nn.Dropout(config.ff_dropout) if config.ff_dropout > 0 else nn.Identity(),
+            nn.Linear(config.ffn_dim, config.d_model, bias=config.bias),
+        )
+
+        # Add & Norm of sublayer 3
+        self.dropout_path3 = nn.Dropout(config.path_dropout) if config.path_dropout > 0 else nn.Identity()
+        if config.norm_type == "batchnorm":
+            self.norm_sublayer3 = PatchTSTBatchNorm(config)
+        elif config.norm_type == "layernorm":
+            self.norm_sublayer3 = nn.LayerNorm(config.d_model, eps=config.norm_eps)
+        else:
+            raise ValueError(f"{config.norm_type} is not a supported norm layer type.")
+
+        self.pre_norm = config.pre_norm
+
+    def forward(self, hidden_state: torch.Tensor, output_attentions: Optional[bool] = None):
+        """
+        Parameters:
+            hidden_state (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`, *required*):
+                Past values of the time series
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the output attention of all layers
+        Return:
+            `torch.Tensor` of shape `(batch_size, num_channels, sequence_length, d_model)`
+
+        """
+        batch_size, num_input_channels, sequence_length, d_model = hidden_state.shape
+
+        # First sublayer: attention across time
+        # hidden_states: [(bs*num_channels) x sequence_length x d_model]
+        hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model)
+
+        if self.pre_norm:
+            ## Norm and Multi-Head attention and Add residual connection
+            attn_output, attn_weights, _ = self.self_attn(
+                hidden_states=self.norm_sublayer1(hidden_state), output_attentions=output_attentions
+            )
+            # Add: residual connection with residual dropout
+            hidden_state = hidden_state + self.dropout_path1(attn_output)
+        else:
+            ## Multi-Head attention and Add residual connection and Norm - Standard Transformer from BERT
+            attn_output, attn_weights, _ = self.self_attn(
+                hidden_states=hidden_state, output_attentions=output_attentions
+            )
+            # hidden_states: [(bs*num_channels) x sequence_length x d_model]
+            hidden_state = self.norm_sublayer1(hidden_state + self.dropout_path1(attn_output))
+
+        # hidden_state: [bs x num_channels x sequence_length x d_model]
+        hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model)
+
+        # second sublayer: attention across variable at any given time
+        if self.channel_attention:
+            # hidden_state: [bs x sequence_length x num_channels x d_model]
+            hidden_state = hidden_state.transpose(2, 1).contiguous()
+            # hidden_state: [(bs*sequence_length) x num_channels x d_model]
+            hidden_state = hidden_state.view(batch_size * sequence_length, num_input_channels, d_model)
+            if self.pre_norm:
+                ## Norm and Multi-Head attention and Add residual connection
+                attn_output, channel_attn_weights, _ = self.self_attn(
+                    hidden_states=self.norm_sublayer2(hidden_state), output_attentions=output_attentions
+                )
+                # Add: residual connection with residual dropout
+                hidden_state = hidden_state + self.dropout_path2(attn_output)
+            else:
+                ## Multi-Head attention and Add residual connection and Norm
+                attn_output, channel_attn_weights, _ = self.self_attn(
+                    hidden_states=hidden_state, output_attentions=output_attentions
+                )
+                # hidden_states: [(bs*sequence_length) x num_channels x d_model]
+                hidden_state = self.norm_sublayer2(hidden_state + self.dropout_path2(attn_output))
+
+            # Reshape hidden state
+            # hidden_state: [bs x sequence_length x num_channels x d_model]
+            hidden_state = hidden_state.reshape(batch_size, sequence_length, num_input_channels, d_model)
+            # hidden_state: [bs x num_channels x sequence_length x d_model]
+            hidden_state = hidden_state.transpose(1, 2).contiguous()
+
+        # Third sublayer: mixing across hidden
+        # hidden_state: [(batch_size*num_channels) x sequence_length x d_model]
+        hidden_state = hidden_state.view(batch_size * num_input_channels, sequence_length, d_model)
+        if self.pre_norm:
+            ## Norm and Position-wise Feed-Forward and Add residual connection
+            # Add: residual connection with residual dropout
+            hidden_state = hidden_state + self.dropout_path3(self.ff(self.norm_sublayer3(hidden_state)))
+        else:
+            ## Position-wise Feed-Forward and Add residual connection and Norm
+            # Add: residual connection with residual dropout
+            hidden_state = self.norm_sublayer3(hidden_state + self.dropout_path3(self.ff(hidden_state)))
+
+        # [bs x num_channels x sequence_length x d_model]
+        hidden_state = hidden_state.reshape(batch_size, num_input_channels, sequence_length, d_model)
+
+        outputs = (hidden_state,)
+        if output_attentions:
+            outputs += (attn_weights, channel_attn_weights) if self.channel_attention else (attn_weights,)
+
+        return outputs
+
+
+class PatchTSTPreTrainedModel(PreTrainedModel):
+    config_class = PatchTSTConfig
+    base_model_prefix = "model"
+    main_input_name = "past_values"
+    supports_gradient_checkpointing = False
+
+    def _init_weights(self, module):
+        """
+        Initialize weights
+        """
+        if isinstance(module, PatchTSTPositionalEncoding):
+            # initialize cls_token
+            if self.config.use_cls_token:
+                nn.init.normal_(module.cls_token, std=0.02)
+            # initialize positional encoding
+            if self.config.positional_encoding_type == "random":
+                nn.init.normal_(module.position_enc, mean=0.0, std=0.1)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, PatchTSTBatchNorm):
+            module.batchnorm.bias.data.zero_()
+            module.batchnorm.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (PatchTSTEncoder)):
+            module.gradient_checkpointing = value
+
+
+class PatchTSTEmbedding(nn.Module):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.num_input_channels = config.num_input_channels
+        self.share_embedding = config.share_embedding
+        # Input encoding: projection of feature vectors onto a d-dim vector space
+        if self.share_embedding:
+            self.input_embedding = nn.Linear(config.patch_length, config.d_model)
+        else:
+            self.input_embedding = nn.ModuleList()
+            for _ in range(config.num_input_channels):
+                self.input_embedding.append(nn.Linear(config.patch_length, config.d_model))
+
+    def forward(self, patch_input: torch.Tensor):
+        """
+        Parameters:
+            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
+                Patch input for embedding
+        return:
+            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, d_model)`
+        """
+        # Input encoding
+        num_input_channels = patch_input.shape[1]
+        if num_input_channels != self.num_input_channels:
+            raise ValueError(
+                f"The defined number of input channels ({self.num_input_channels}) in the config "
+                f"has to be the same as the number of channels in the batch input ({num_input_channels})"
+            )
+        if self.share_embedding:
+            embeddings = self.input_embedding(patch_input)  # x: [bs x num_channels  x num_patches x d_model]
+        else:
+            embeddings = [self.input_embedding[i](patch_input[:, i, :, :]) for i in range(num_input_channels)]
+            embeddings = torch.stack(embeddings, dim=1)
+        return embeddings
+
+
+class PatchTSTPositionalEncoding(nn.Module):
+    """
+    Class for positional encoding
+    """
+
+    def __init__(self, config: PatchTSTConfig, num_patches: int):
+        super().__init__()
+        self.use_cls_token = config.use_cls_token
+        self.num_input_channels = config.num_input_channels
+        if config.use_cls_token:
+            # cls_token: [1 x num_input_channels x 1 x d_model]
+            self.cls_token = nn.Parameter(torch.zeros(1, 1, 1, config.d_model))
+            num_patches += 1
+        # postional encoding: [num_patches x d_model]
+        self.position_enc = self._init_pe(config, num_patches)
+        # Positional dropout
+        self.positional_dropout = (
+            nn.Dropout(config.positional_dropout) if config.positional_dropout > 0 else nn.Identity()
+        )
+
+    @staticmethod
+    def _init_pe(config: PatchTSTConfig, num_patches: int) -> nn.Parameter:
+        # Positional encoding
+        if config.positional_encoding_type == "random":
+            position_enc = nn.Parameter(torch.randn(num_patches, config.d_model), requires_grad=True)
+        elif config.positional_encoding_type == "sincos":
+            position_enc = torch.zeros(num_patches, config.d_model)
+            position = torch.arange(0, num_patches).unsqueeze(1)
+            div_term = torch.exp(torch.arange(0, config.d_model, 2) * -(math.log(10000.0) / config.d_model))
+            position_enc[:, 0::2] = torch.sin(position * div_term)
+            position_enc[:, 1::2] = torch.cos(position * div_term)
+            position_enc = position_enc - position_enc.mean()
+            position_enc = position_enc / (position_enc.std() * 10)
+            position_enc = nn.Parameter(position_enc, requires_grad=False)
+        else:
+            raise ValueError(
+                f"{config.positional_encoding_type} is not a valid positional encoder. Available types are 'random' and 'sincos'."
+            )
+        return position_enc
+
+    def forward(self, patch_input: torch.Tensor):
+        if self.use_cls_token:
+            # patch_input: [bs x num_channels x num_patches x d_model]
+            patch_input = self.positional_dropout(patch_input + self.position_enc[1:, :])
+            # append cls token where cls_token: [1 x num_channels x 1 x d_model]
+            cls_token = self.cls_token + self.position_enc[:1, :]
+            # get the same copy of cls_token for all the samples in batch: [bs x num_channels x 1 x d_model]
+            cls_tokens = cls_token.expand(patch_input.shape[0], self.num_input_channels, -1, -1)
+            # hidden_state: [bs x num_channels x (num_patches+1) x d_model]
+            hidden_state = torch.cat((cls_tokens, patch_input), dim=2)
+        else:
+            # hidden_state: [bs x num_channels x num_patches x d_model]
+            hidden_state = self.positional_dropout(patch_input + self.position_enc)
+        return hidden_state
+
+
+class PatchTSTEncoder(PatchTSTPreTrainedModel):
+    """
+    PatchTST Encoder
+    """
+
+    def __init__(self, config: PatchTSTConfig, num_patches: int):
+        super().__init__(config)
+        self.gradient_checkpointing = False
+
+        # Input embedding: projection of feature vectors onto a d-dim vector space
+        self.embedder = PatchTSTEmbedding(config)
+        # Positional encoding
+        self.positional_encoder = PatchTSTPositionalEncoding(config, num_patches)
+        # Encoder
+        self.layers = nn.ModuleList([PatchTSTEncoderLayer(config) for i in range(config.num_hidden_layers)])
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        patch_input: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+    ) -> BaseModelOutput:
+        """
+        Parameters:
+            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
+                Past values of the time series
+            output_hidden_states (bool, optional): Indicates if hidden states should be outputted.
+            output_attentions (bool, optional): Indicates if attentions should be outputted.
+
+        return:
+            `BaseModelOutput`
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # Input embedding
+        patch_input = self.embedder(patch_input)
+        # Positional encoding
+        hidden_state = self.positional_encoder(patch_input)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_state,)
+
+            layer_outputs = encoder_layer(hidden_state=hidden_state, output_attentions=output_attentions)
+            # get hidden state. hidden_state shape is [bs x num_channels x num_patches x d_model]
+            # or [bs x num_channels x (num_patches+1) x d_model] if use cls_token
+            hidden_state = layer_outputs[0]
+            # append attention matrix at each layer
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+        # return past_values, hidden_states
+        return BaseModelOutput(last_hidden_state=hidden_state, hidden_states=encoder_states, attentions=all_attentions)
+
+
+PATCHTST_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`PatchTSTConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@dataclass
+class PatchTSTModelOutput(ModelOutput):
+    """
+    Base class for model's outputs, with potential hidden states.
+
+    Parameters:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of
+            the model at the output of each layer plus the optional initial embedding outputs.
+        mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`, *optional*)
+            Bool masked tensor indicating which patches are masked
+        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
+            Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
+        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
+            Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
+        patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
+            Patched input to the Transformer
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    mask: torch.FloatTensor = None
+    loc: torch.FloatTensor = None
+    scale: torch.FloatTensor = None
+    patch_input: torch.FloatTensor = None
+
+
+@dataclass
+class PatchTSTForPretrainingOutput(ModelOutput):
+    """
+    Output type of [`PatchTSTForPretraining`].
+
+    Parameters:
+        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+            MSE loss.
+        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction outputs of the time series modeling heads.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_output: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class PatchTSTForRegressionOutput(ModelOutput):
+    """
+    Output type of [`PatchTSTForRegression`].
+
+    Parameters:
+        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+            MSE loss.
+        regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
+            Regression outputs of the time series modeling heads.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    regression_outputs: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class PatchTSTForPredictionOutput(ModelOutput):
+    """
+    Output type of [`PatchTSTForPrediction`].
+
+    Parameters:
+        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+            MSE loss.
+        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, -1)`):
+            Prediction outputs of the time series modeling heads.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
+            Mean of the input data (batch_size, sequence_length, num_channels) over the sequence_length
+        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`, *optional*)
+            Std of the input data (batch_size, sequence_length, num_channels) over the sequence_length
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_outputs: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    loc: torch.FloatTensor = None
+    scale: torch.FloatTensor = None
+
+
+@dataclass
+class PatchTSTForClassificationOutput(ModelOutput):
+    """
+    Output type of [`PatchTSTForClassification`].
+
+    Parameters:
+        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction
+            (classification) loss.
+        prediction_logits (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
+            Prediction scores of the PatchTST modeling head (scores before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class SamplePatchTSTOutput(ModelOutput):
+    """
+    Base class for time series model's predictions outputs that contains the sampled values from the chosen
+    distribution.
+
+    Parameters:
+        sequences `(batch_size, num_samples, prediction_length, num_targets)`):
+                Sampled values from the chosen distribution.
+    """
+
+    sequences: torch.FloatTensor = None
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll
+def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor:
+    """
+    Computes the negative log likelihood loss from input distribution with respect to target.
+    """
+    return -input.log_prob(target)
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average
+def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor:
+    """
+    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
+    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.
+
+    Args:
+        input_tensor (`torch.FloatTensor`):
+            Input tensor, of which the average must be computed.
+        weights (`torch.FloatTensor`, *optional*):
+            Weights tensor, of the same shape as `input_tensor`.
+        dim (`int`, *optional*):
+            The dim along which to average `input_tensor`.
+
+    Returns:
+        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
+    """
+    if weights is not None:
+        weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor))
+        sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0)
+        return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights
+    else:
+        return input_tensor.mean(dim=dim)
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST
+class PatchTSTStdScaler(nn.Module):
+    """
+    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
+    subtracting from the mean and dividing by the standard deviation.
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
+        denominator = denominator.clamp_min(1.0)
+        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
+
+        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        scale = torch.sqrt(variance + self.minimum_scale)
+        return (data - loc) / scale, loc, scale
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST
+class PatchTSTMeanScaler(nn.Module):
+    """
+    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
+    accordingly.
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
+        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
+        num_observed = observed_indicator.sum(self.dim, keepdim=True)
+
+        scale = ts_sum / torch.clamp(num_observed, min=1)
+
+        # If `default_scale` is provided, we use it, otherwise we use the scale
+        # of the batch.
+        if self.default_scale is None:
+            batch_sum = ts_sum.sum(dim=0)
+            batch_observations = torch.clamp(num_observed.sum(0), min=1)
+            default_scale = torch.squeeze(batch_sum / batch_observations)
+        else:
+            default_scale = self.default_scale * torch.ones_like(scale)
+
+        # apply default scale where there are no observations
+        scale = torch.where(num_observed > 0, scale, default_scale)
+
+        # ensure the scale is at least `self.minimum_scale`
+        scale = torch.clamp(scale, min=self.minimum_scale)
+        scaled_data = data / scale
+
+        if not self.keepdim:
+            scale = scale.squeeze(dim=self.dim)
+
+        return scaled_data, torch.zeros_like(scale), scale
+
+
+# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->PatchTST,TimeSeries->PatchTST
+class PatchTSTNOPScaler(nn.Module):
+    """
+    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
+        return data, loc, scale
+
+
+class PatchTSTScaler(nn.Module):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        if config.scaling == "mean" or config.scaling is True:
+            self.scaler = PatchTSTMeanScaler(config)
+        elif config.scaling == "std":
+            self.scaler = PatchTSTStdScaler(config)
+        else:
+            self.scaler = PatchTSTNOPScaler(config)
+
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Input for scaler calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, um_input_channels)`)
+        """
+        data, loc, scale = self.scaler(data, observed_indicator)
+        return data, loc, scale
+
+
+@add_start_docstrings(
+    "The bare PatchTST Model outputting raw hidden-states without any specific head.",
+    PATCHTST_START_DOCSTRING,
+)
+class PatchTSTModel(PatchTSTPreTrainedModel):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__(config)
+
+        self.scaler = PatchTSTScaler(config)
+        self.patchifier = PatchTSTPatchify(config)
+        self.do_mask_input = config.do_mask_input
+        # get num_patches information from PatchTSTPatchify
+        num_patches = self.patchifier.num_patches
+
+        if self.do_mask_input:
+            self.masking = PatchTSTMasking(config)
+        else:
+            self.masking = nn.Identity()
+        self.encoder = PatchTSTEncoder(config, num_patches=num_patches)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, PatchTSTModelOutput]:
+        r"""
+        Parameters:
+            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
+                Input sequence to the model
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+            future_values (`torch.BoolTensor` of shape `(batch_size, prediction_length, num_input_channels)`, *optional*):
+                Future target values associated with the `past_values`
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the output attention of all layers
+            return_dict (`bool`, *optional*):
+                Whether or not to return a `ModelOutput` instead of a plain tuple.
+
+        Returns:
+            `PatchTSTModelOutput` or tuple of `torch.Tensor` (if `return_dict`=False or `config.return_dict`=False)
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import PatchTSTModel
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> model = PatchTSTModel.from_pretrained("namctin/patchtst_etth1_pretrain")
+
+        >>> # during training, one provides both past and future values
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     future_values=batch["future_values"],
+        ... )
+
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if past_observed_mask is None:
+            past_observed_mask = torch.ones_like(past_values)
+
+        # x: tensor [bs x sequence_length x num_input_channels]
+        scaled_past_values, loc, scale = self.scaler(past_values, past_observed_mask)
+
+        # patched_values: [bs x num_input_channels x num_patches x patch_length] for pretrain
+        patched_values = self.patchifier(scaled_past_values)
+        if self.do_mask_input:
+            masked_values, mask = self.masking(patched_values)
+        else:
+            masked_values, mask = self.masking(patched_values), None
+
+        encoder_output = self.encoder(
+            patch_input=masked_values, output_hidden_states=output_hidden_states, output_attentions=output_attentions
+        )
+
+        if not return_dict:
+            outputs = (encoder_output.last_hidden_state, encoder_output.hidden_states, encoder_output.attentions)
+            outputs = outputs + (mask, loc, scale, patched_values)
+            return tuple(v for v in outputs if v is not None)
+
+        return PatchTSTModelOutput(
+            last_hidden_state=encoder_output.last_hidden_state,
+            hidden_states=encoder_output.hidden_states,
+            attentions=encoder_output.attentions,
+            mask=mask,
+            loc=loc,
+            scale=scale,
+            patch_input=patched_values,
+        )
+
+
+class PatchTSTMaskPretrainHead(nn.Module):
+    """
+    Pretraining head for mask modelling
+    """
+
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.dropout = nn.Dropout(config.dropout)
+        self.linear = nn.Linear(config.d_model, config.patch_length)
+        self.use_cls_token = config.use_cls_token
+
+    def forward(self, embedding: torch.Tensor) -> torch.Tensor:
+        """
+        Parameters:
+            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
+                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
+                Embedding from the model
+        Returns:
+            `torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
+                            `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True
+
+        """
+        embedding = self.linear(self.dropout(embedding))  # [bs x num_channels x num_patches x patch_length]
+        if self.use_cls_token:
+            embedding = embedding[:, :, 1:, :]  # remove the first cls token
+        return embedding
+
+
+@add_start_docstrings(
+    "The PatchTST for pretrain model.",
+    PATCHTST_START_DOCSTRING,
+)
+class PatchTSTForPretraining(PatchTSTPreTrainedModel):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__(config)
+
+        config.do_mask_input = True
+        self.model = PatchTSTModel(config=config)
+        self.head = PatchTSTMaskPretrainHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, PatchTSTForPretrainingOutput]:
+        r"""
+        Parameters:
+            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
+                Input sequence to the model
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the output attention of all layers
+            return_dict (`bool`, *optional*): Whether or not to return a `ModelOutput` instead of a plain tuple.
+
+        Returns:
+            `PatchTSTForPretrainingOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
+            `config.return_dict`=False)
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import PatchTSTConfig, PatchTSTForPretraining
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> # Config for random mask pretraining
+        >>> config = PatchTSTConfig(
+        ...     num_input_channels=7,
+        ...     context_length=512,
+        ...     patch_length=12,
+        ...     stride=12,
+        ...     mask_type='random',
+        ...     random_mask_ratio=0.4,
+        ...     use_cls_token=True,
+        ... )
+        >>> # Config for forecast mask pretraining
+        >>> config = PatchTSTConfig(
+        ...     num_input_channels=7,
+        ...     context_length=512,
+        ...     patch_length=12,
+        ...     stride=12,
+        ...     mask_type='forecast',
+        ...     num_forecast_mask_patches=5,
+        ...     use_cls_token=True,
+        ... )
+        >>> model = PatchTSTForPretraining(config)
+
+        >>> # during training, one provides both past and future values
+        >>> outputs = model(past_values=batch["past_values"])
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # past_values: [bs x num_channels x num_patches x d_model] or
+        # [bs x num_channels x (num_patches+1) x d_model] if use cls_token
+        model_output = self.model(
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=True,
+        )
+
+        # last_hidden_state: [bs x num_channels x num_patches x patch_length] or
+        # [bs x num_channels x (num_patches+1) x patch_length] if use cls_token
+        x_hat = self.head(model_output.last_hidden_state)
+
+        # calculate masked_loss
+        loss = nn.MSELoss(reduction="none")
+        loss_val = loss(x_hat, model_output.patch_input)
+        masked_loss = (loss_val.mean(dim=-1) * model_output.mask).sum() / (model_output.mask.sum() + 1e-10)
+
+        encoder_states = model_output.hidden_states
+        if not return_dict:
+            outputs = (x_hat,) + model_output[1:-4]
+            outputs = (masked_loss,) + outputs if masked_loss is not None else outputs
+            return outputs
+        return PatchTSTForPretrainingOutput(
+            loss=masked_loss, prediction_output=x_hat, hidden_states=encoder_states, attentions=model_output.attentions
+        )
+
+
+class PatchTSTClassificationHead(nn.Module):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__()
+        self.use_cls_token = config.use_cls_token
+        self.pooling_type = config.pooling_type
+        self.flatten = nn.Flatten(start_dim=1)
+        self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()
+        self.linear = nn.Linear(config.num_input_channels * config.d_model, config.num_targets)
+
+    def forward(self, embedding: torch.Tensor):
+        """
+        Parameters:
+            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
+                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
+                Embedding from the model
+        Returns:
+            `torch.Tensor` of shape `(bs, num_targets)`
+
+        """
+        if self.use_cls_token:
+            # use the first output token, pooled_embedding: bs x num_channels x d_model
+            pooled_embedding = embedding[:, :, 0, :]
+        elif self.pooling_type == "mean":
+            # pooled_embedding: [bs x num_channels x d_model]
+            pooled_embedding = embedding.mean(dim=2)
+        elif self.pooling_type == "max":
+            # pooled_embedding: [bs x num_channels x d_model]
+            pooled_embedding = embedding.max(dim=2)
+        else:
+            raise ValueError(f"pooling operator {self.pooling_type} is not implemented yet")
+        # pooled_embedding: bs x num_channels * d_model
+        pooled_embedding = self.flatten(pooled_embedding)
+        # output: bs x n_classes
+        output = self.linear(self.dropout(pooled_embedding))
+        return output
+
+
+@add_start_docstrings(
+    "The PatchTST for classification model.",
+    PATCHTST_START_DOCSTRING,
+)
+class PatchTSTForClassification(PatchTSTPreTrainedModel):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__(config)
+
+        # Turn off masking
+        if config.do_mask_input:
+            logger.warning("Setting `do_mask_input` parameter to False.")
+            config.do_mask_input = False
+
+        self.model = PatchTSTModel(config)
+        self.head = PatchTSTClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        target_values: torch.Tensor = None,
+        past_observed_mask: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, PatchTSTForClassificationOutput]:
+        r"""
+        Parameters:
+            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
+                Input sequence to the model
+            target_values (`torch.Tensor`, *optional*):
+                Labels associates with the `past_values`
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the output attention of all layers
+            return_dict (`bool`, *optional*):
+                Whether or not to return a `ModelOutput` instead of a plain tuple.
+
+        Returns:
+            `PatchTSTForClassificationOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
+            `config.return_dict`=False)
+
+        Examples:
+
+        ```python
+        >>> from transformers import PatchTSTConfig, PatchTSTForClassification
+
+        >>> # classification task with two input channel2 and 3 classes
+        >>> config = PatchTSTConfig(
+        ...     num_input_channels=2,
+        ...     num_targets=3,
+        ...     context_length=512,
+        ...     patch_length=12,
+        ...     stride=12,
+        ...     use_cls_token=True,
+        ... )
+        >>> model = PatchTSTForClassification(config=config)
+
+        >>> # during inference, one only provides past values
+        >>> past_values = torch.randn(20, 512, 2)
+        >>> outputs = model(past_values=past_values)
+        >>> labels = outputs.prediction_logits
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        model_output = self.model(
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=True,
+        )
+        y_hat = self.head(model_output.last_hidden_state)
+
+        loss_val = None
+        if target_values is not None:
+            loss = nn.CrossEntropyLoss()
+            loss_val = loss(y_hat, target_values)
+
+        if not return_dict:
+            outputs = (y_hat,) + model_output[1:-3]
+            outputs = (loss_val,) + outputs if loss_val is not None else outputs
+            return outputs
+        return PatchTSTForClassificationOutput(
+            loss=loss_val,
+            prediction_logits=y_hat,
+            hidden_states=model_output.hidden_states,
+            attentions=model_output.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The PatchTST for regression Model.",
+    PATCHTST_START_DOCSTRING,
+)
+class PatchTSTPredictionHead(nn.Module):
+    def __init__(self, config: PatchTSTConfig, num_patches, distribution_output=None):
+        super().__init__()
+
+        self.share_projection = config.share_projection
+        self.num_input_channels = config.num_input_channels
+        self.use_cls_token = config.use_cls_token
+        self.pooling_type = config.pooling_type
+        if self.pooling_type or self.use_cls_token:
+            head_dim = config.d_model
+        else:
+            head_dim = config.d_model * num_patches
+
+        if not self.share_projection:
+            # if each channel has its own head
+            self.projections = nn.ModuleList()
+            self.dropouts = nn.ModuleList()
+            self.flattens = nn.ModuleList()
+            for i in range(self.num_input_channels):
+                self.flattens.append(nn.Flatten(start_dim=2))
+                if distribution_output is None:
+                    # use linear head
+                    self.projections.append(nn.Linear(head_dim, config.prediction_length))
+                else:
+                    # use distribution head
+                    self.projections.append(distribution_output.get_parameter_projection(head_dim))
+                self.dropouts.append(nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity())
+        else:
+            # all the channels share the same head
+            self.flatten = nn.Flatten(start_dim=2)
+            if distribution_output is None:
+                # use linear head
+                self.projection = nn.Linear(head_dim, config.prediction_length)
+            else:
+                # use distribution head
+                self.projection = distribution_output.get_parameter_projection(head_dim)
+            self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()
+
+    def forward(self, embedding: torch.Tensor):
+        """
+        Parameters:
+            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
+                     `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
+                Embedding from the model
+        Returns:
+            `torch.Tensor` of shape `(bs, forecast_len, num_channels)`
+
+        """
+        if self.use_cls_token:
+            # pooled_embedding: [bs x num_channels x d_model]
+            pooled_embedding = embedding[:, :, 0, :]
+        else:
+            if self.pooling_type == "mean":
+                # pooled_embedding: [bs x num_channels x d_model]
+                pooled_embedding = embedding.mean(dim=2)
+            elif self.pooling_type == "max":
+                # pooled_embedding: [bs x num_channels x d_model]
+                pooled_embedding = embedding.max(dim=2)
+            else:
+                # pooled_embedding: [bs x num_channels x num_patches x d_model]
+                pooled_embedding = embedding
+
+        if not self.share_projection:
+            output = []
+            for i in range(self.num_input_channels):
+                # pooled_embedding: [bs x (d_model * num_patches)] or [bs x d_model)]
+                pooled_embedding = self.flattens[i](pooled_embedding[:, i, :])
+                pooled_embedding = self.dropouts[i](pooled_embedding)
+                # pooled_embedding: [bs x forecast_len]
+                #  or tuple ([bs x forecast_len], [bs x forecast_len]) if using distribution head
+                pooled_embedding = self.projections[i](pooled_embedding)
+                output.append(pooled_embedding)
+            # output: [bs x num_channels x forecast_len]
+            output = torch.stack(output, dim=1)
+        else:
+            # pooled_embedding: [bs x num_channels x (d_model * num_patches)] or [bs x num_channels x d_model)]
+            pooled_embedding = self.flatten(pooled_embedding)
+            pooled_embedding = self.dropout(pooled_embedding)
+            # output: [bs x num_channels x forecast_len] or
+            # tuple ([bs x num_channels x forecast_len], [bs x num_channels x forecast_len]) if using distribution head
+            output = self.projection(pooled_embedding)
+
+        if isinstance(output, tuple):
+            # output: ([bs x forecast_len x num_channels], [bs x forecast_len x num_channels])
+            output = tuple(z.transpose(2, 1) for z in output)
+        else:
+            output = output.transpose(2, 1)  # [bs x forecast_len x num_channels]
+        return output
+
+
+@add_start_docstrings(
+    "The PatchTST for prediction model.",
+    PATCHTST_START_DOCSTRING,
+)
+class PatchTSTForPrediction(PatchTSTPreTrainedModel):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__(config)
+
+        # Turn off masking
+        if config.do_mask_input:
+            logger.warning("Setting `do_mask_input` parameter to False.")
+            config.do_mask_input = False
+
+        self.model = PatchTSTModel(config)
+
+        if config.loss == "mse":
+            self.distribution_output = None
+        else:
+            if config.distribution_output == "student_t":
+                self.distribution_output = StudentTOutput(dim=config.prediction_length)
+            elif config.distribution_output == "normal":
+                self.distribution_output = NormalOutput(dim=config.prediction_length)
+            elif config.distribution_output == "negative_binomial":
+                self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length)
+            else:
+                raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        self.head = PatchTSTPredictionHead(
+            config, self.model.patchifier.num_patches, distribution_output=self.distribution_output
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        future_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, PatchTSTForPredictionOutput]:
+        r"""
+        Parameters:
+            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
+                Input sequence to the model
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+            future_values (`torch.Tensor` of shape `(bs, forecast_len, num_input_channels)`, *optional*):
+                Future target values associated with the `past_values`
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the output attention of all layers
+            return_dict (`bool`, *optional*):
+                Whether or not to return a `ModelOutput` instead of a plain tuple.
+
+        Returns:
+            `PatchTSTForPredictionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
+            `config.return_dict`=False)
+
+        Examples:
+
+        ```python
+        >>> from huggingface_hub import hf_hub_download
+        >>> import torch
+        >>> from transformers import PatchTSTConfig, PatchTSTForPrediction
+
+        >>> file = hf_hub_download(
+        ...     repo_id="hf-internal-testing/etth1-hourly-batch", filename="train-batch.pt", repo_type="dataset"
+        ... )
+        >>> batch = torch.load(file)
+
+        >>> # Prediction task with 7 input channels and prediction length is 96
+        >>> model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast")
+
+        >>> # during training, one provides both past and future values
+        >>> outputs = model(
+        ...     past_values=batch["past_values"],
+        ...     future_values=batch["future_values"],
+        ... )
+
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # during inference, one only provides past values, the model outputs future values
+        >>> outputs = model(past_values=batch["past_values"])
+        >>> prediction_outputs = outputs.prediction_outputs
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # get model output
+        model_output = self.model(
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=True,
+        )
+        # get output head
+        y_hat = self.head(model_output.last_hidden_state)
+
+        loss_val = None
+
+        if self.distribution_output:
+            y_hat_out = y_hat
+        else:
+            y_hat_out = y_hat * model_output.scale + model_output.loc
+
+        if future_values is not None:
+            if self.distribution_output:
+                distribution = self.distribution_output.distribution(
+                    y_hat, loc=model_output.loc, scale=model_output.scale
+                )
+                loss_val = nll(distribution, future_values)
+                # take average of the loss
+                loss_val = weighted_average(loss_val)
+            else:
+                loss = nn.MSELoss(reduction="mean")
+                loss_val = loss(y_hat_out, future_values)
+
+        loc = model_output.loc
+        scale = model_output.scale
+
+        if not return_dict:
+            outputs = (y_hat_out,) + model_output[1:-1]
+            outputs = (loss_val,) + outputs if loss_val is not None else outputs
+            return outputs
+        return PatchTSTForPredictionOutput(
+            loss=loss_val,
+            prediction_outputs=y_hat_out,
+            hidden_states=model_output.hidden_states,
+            attentions=model_output.attentions,
+            loc=loc,
+            scale=scale,
+        )
+
+    def generate(
+        self,
+        past_values: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+    ) -> SamplePatchTSTOutput:
+        """
+        Generate sequences of sample predictions from a model with a probability distribution head.
+
+        Parameters:
+            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Past values of the time series that serves as context in order to predict the future.
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        Return:
+            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
+            samples, prediction_length, 1)` or `(batch_size, number of samples, prediction_length, num_input_channels)`
+            for multivariate predictions.
+        """
+        # get number of samples
+        num_parallel_samples = self.config.num_parallel_samples
+
+        # get model output
+        outputs = self(
+            past_values=past_values,
+            future_values=None,
+            past_observed_mask=past_observed_mask,
+            output_hidden_states=False,
+        )
+        if self.distribution_output:
+            # get distribution
+            distribution = self.distribution_output.distribution(
+                outputs.prediction_outputs, loc=outputs.loc, scale=outputs.scale
+            )
+            # get samples: list of [bs x forecast_len x num_channels]
+            samples = [distribution.sample() for _ in range(num_parallel_samples)]
+            # samples: [bs x num_samples x forecast_len x num_channels]
+            samples = torch.stack(samples, dim=1)
+        else:
+            samples = outputs.prediction_outputs.unsqueeze(1)
+
+        return SamplePatchTSTOutput(sequences=samples)
+
+
+class PatchTSTRegressionHead(nn.Module):
+    """
+    Regression head
+    """
+
+    def __init__(self, config: PatchTSTConfig, distribution_output=None):
+        super().__init__()
+        self.y_range = config.output_range
+        self.use_cls_token = config.use_cls_token
+        self.pooling_type = config.pooling_type
+        self.distribution_output = distribution_output
+
+        head_dim = config.num_input_channels * config.d_model
+
+        self.flatten = nn.Flatten(start_dim=1)
+        self.dropout = nn.Dropout(config.head_dropout) if config.head_dropout > 0 else nn.Identity()
+
+        if distribution_output is None:
+            self.projection = nn.Linear(head_dim, config.num_targets)
+        else:
+            self.projection = distribution_output.get_parameter_projection(head_dim)
+
+    def forward(self, embedding: torch.Tensor):
+        """
+        Parameters:
+            embedding (`torch.Tensor` of shape `(bs, num_channels, num_patches, d_model)` or
+                    `(bs, num_channels, num_patches+1, d_model)` if `cls_token` is set to True, *required*):
+                Embedding from the model
+        Returns:
+            `torch.Tensor` of shape `(bs, output_dim)`
+
+        """
+        if self.use_cls_token:
+            # use the first output token, pooled_embedding: [bs x num_channels x d_model]
+            pooled_embedding = embedding[:, :, 0, :]
+        elif self.pooling_type == "mean":
+            # pooled_embedding: [bs x num_channels x d_model]
+            pooled_embedding = embedding.mean(dim=2)
+        elif self.pooling_type == "max":
+            # pooled_embedding: [bs x num_channels x d_model]
+            pooled_embedding = embedding.max(dim=2)
+        else:
+            raise ValueError(f"pooling operator {self.pooling_type} is not implemented yet")
+        # flatten the input
+        # pooled_embedding: bs x (num_channels * d_model)
+        pooled_embedding = self.dropout(self.flatten(pooled_embedding))
+        # projection
+        # output: bs x output_dim or a tuple of this shape for distribution head
+        output = self.projection(pooled_embedding)
+        # apply sigmoid to bound the output if required
+        if (self.distribution_output is None) & (self.y_range is not None):  # linear head
+            output = torch.sigmoid(output) * (self.y_range[1] - self.y_range[0]) + self.y_range[0]
+        return output
+
+
+@add_start_docstrings(
+    "The PatchTST for regression model.",
+    PATCHTST_START_DOCSTRING,
+)
+class PatchTSTForRegression(PatchTSTPreTrainedModel):
+    def __init__(self, config: PatchTSTConfig):
+        super().__init__(config)
+
+        # Turn off masking
+        if config.do_mask_input:
+            logger.warning("Setting `do_mask_input` parameter to False.")
+            config.do_mask_input = False
+
+        self.model = PatchTSTModel(config)
+        if config.loss == "mse":
+            self.distribution_output = None
+        else:
+            if config.distribution_output == "student_t":
+                self.distribution_output = StudentTOutput(dim=config.prediction_length * config.num_targets)
+            elif config.distribution_output == "normal":
+                self.distribution_output = NormalOutput(dim=config.prediction_length * config.num_targets)
+            elif config.distribution_output == "negative_binomial":
+                self.distribution_output = NegativeBinomialOutput(dim=config.prediction_length * config.num_targets)
+            else:
+                raise ValueError(f"Unknown distribution output {config.distribution_output}")
+
+        self.head = PatchTSTRegressionHead(config, self.distribution_output)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        past_values: torch.Tensor,
+        target_values: torch.Tensor = None,
+        past_observed_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, PatchTSTForRegressionOutput]:
+        r"""
+        Parameters:
+            past_values (`torch.Tensor` of shape `(bs, sequence_length, num_input_channels)`, *required*):
+                Input sequence to the model
+            target_values (`torch.Tensor` of shape `(bs, num_input_channels)`):
+                Target values associates with the `past_values`
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the output attention of all layers
+            return_dict (`bool`, *optional*):
+                Whether or not to return a `ModelOutput` instead of a plain tuple.
+
+        Returns:
+            `PatchTSTForRegressionOutput` or tuple of `torch.Tensor` (if `return_dict`=False or
+            `config.return_dict`=False)
+
+        Examples:
+
+        ```python
+        >>> from transformers import PatchTSTConfig, PatchTSTForRegression
+
+        >>> # Regression task with 6 input channels and regress 2 targets
+        >>> model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression")
+
+        >>> # during inference, one only provides past values, the model outputs future values
+        >>> past_values = torch.randn(20, 512, 6)
+        >>> outputs = model(past_values=past_values)
+        >>> regression_outputs = outputs.regression_outputs
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        model_output = self.model(
+            past_values=past_values,
+            past_observed_mask=past_observed_mask,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=True,
+        )
+        # get output head. y_hat is of shape [bs x num_targets] or tuple of this shape
+        y_hat = self.head(model_output.last_hidden_state)
+
+        loss = None
+        if target_values is not None:
+            if self.distribution_output:
+                distribution = self.distribution_output.distribution(y_hat)
+                loss = nll(distribution, target_values)
+                # take average of the loss
+                loss = weighted_average(loss)
+            else:
+                loss = nn.MSELoss(reduction="mean")
+                loss = loss(y_hat, target_values)
+
+        if not return_dict:
+            outputs = (y_hat,) + model_output[1:-3]
+            outputs = (loss,) + outputs if loss is not None else outputs
+            return outputs
+        return PatchTSTForRegressionOutput(
+            loss=loss,
+            regression_outputs=y_hat,
+            hidden_states=model_output.hidden_states,
+            attentions=model_output.attentions,
+        )
+
+    def generate(
+        self,
+        past_values: torch.Tensor,
+        past_observed_mask: Optional[torch.Tensor] = None,
+    ) -> SamplePatchTSTOutput:
+        """
+        Generate sequences of sample predictions from a model with a probability distribution head.
+
+        Parameters:
+            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Past values of the time series that serves as context in order to predict the future.
+            past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
+                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
+                in `[0, 1]`:
+
+                - 1 for values that are **observed**,
+                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
+
+        Return:
+            [`SamplePatchTSTOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of
+            samples, num_targets)`.
+        """
+        # get number of samples
+        num_parallel_samples = self.config.num_parallel_samples
+
+        # get model output
+        outputs = self(
+            past_values=past_values,
+            target_values=None,
+            past_observed_mask=past_observed_mask,
+            output_hidden_states=False,
+        )
+
+        # get distribution
+        distribution = self.distribution_output.distribution(outputs.regression_outputs)
+        # get samples: list of [bs x num_targets]
+        samples = [distribution.sample() for _ in range(num_parallel_samples)]
+        # samples: [bs x num_samples x num_targets]
+        samples = torch.stack(samples, dim=1)
+        return SamplePatchTSTOutput(sequences=samples)
diff --git a/src/transformers/models/pegasus/modeling_pegasus.py b/src/transformers/models/pegasus/modeling_pegasus.py
index 18af4d518a899b..1a75c43e58e0ee 100755
--- a/src/transformers/models/pegasus/modeling_pegasus.py
+++ b/src/transformers/models/pegasus/modeling_pegasus.py
@@ -267,7 +267,7 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-PEGASUS_ATTENTION_CLASSES = {"default": PegasusAttention}
+PEGASUS_ATTENTION_CLASSES = {"eager": PegasusAttention}
 
 
 # Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Pegasus, MBART->PEGASUS
@@ -275,9 +275,8 @@ class PegasusEncoderLayer(nn.Module):
     def __init__(self, config: PegasusConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = PEGASUS_ATTENTION_CLASSES[attn_type](
+        self.self_attn = PEGASUS_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -347,9 +346,8 @@ class PegasusDecoderLayer(nn.Module):
     def __init__(self, config: PegasusConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = PEGASUS_ATTENTION_CLASSES[attn_type](
+        self.self_attn = PEGASUS_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -362,7 +360,7 @@ def __init__(self, config: PegasusConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = PEGASUS_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = PEGASUS_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
diff --git a/src/transformers/models/pegasus/tokenization_pegasus_fast.py b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
index 3faeccd2500cc8..3bc1726876e819 100644
--- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -145,6 +145,8 @@ def __init__(
         from_slow = kwargs.pop("from_slow", None)
         from_slow = from_slow or str(pad_token) != "<pad>" or str(eos_token) != "</s>" or str(unk_token) != "<unk>"
 
+        kwargs.pop("added_tokens_decoder", {})
+
         super().__init__(
             vocab_file,
             tokenizer_file=tokenizer_file,
diff --git a/src/transformers/models/persimmon/modeling_persimmon.py b/src/transformers/models/persimmon/modeling_persimmon.py
index f1b7ae320539c1..17163dcd8edf9b 100644
--- a/src/transformers/models/persimmon/modeling_persimmon.py
+++ b/src/transformers/models/persimmon/modeling_persimmon.py
@@ -27,6 +27,7 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
 from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
 from ...modeling_utils import PreTrainedModel
@@ -59,7 +60,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -89,7 +90,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
         t = t / self.scaling_factor
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -116,7 +117,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
 
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -178,15 +179,24 @@ def forward(self, hidden_states):
 class PersimmonAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: PersimmonConfig):
+    def __init__(self, config: PersimmonConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
         self.partial_rotary_factor = config.partial_rotary_factor
+        self.is_causal = True
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
@@ -256,7 +266,7 @@ def forward(
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
@@ -279,7 +289,13 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         # Partial rotary embedding
@@ -299,11 +315,9 @@ def forward(
         key_states = torch.cat((key_rot, key_pass), dim=-1)
 
         if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
+            # Specific to RoPE models with partial rotation
+            cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
@@ -344,10 +358,10 @@ def forward(
 
 
 class PersimmonDecoderLayer(nn.Module):
-    def __init__(self, config: PersimmonConfig):
+    def __init__(self, config: PersimmonConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
-        self.self_attn = PersimmonAttention(config=config)
+        self.self_attn = PersimmonAttention(config=config, layer_idx=layer_idx)
         self.mlp = PersimmonMLP(config)
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
@@ -443,6 +457,7 @@ class PersimmonPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["PersimmonDecoderLayer"]
     _skip_keys_device_placement = "past_key_values"
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
@@ -491,17 +506,23 @@ def _init_weights(self, module):
             config.n_positions - 1]`.
 
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -538,7 +559,9 @@ def __init__(self, config: PersimmonConfig):
         self.vocab_size = config.vocab_size
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
-        self.layers = nn.ModuleList([PersimmonDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList(
+            [PersimmonDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
         self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
         self.gradient_checkpointing = False
@@ -585,8 +608,11 @@ def forward(
         seq_length_with_past = seq_length
         past_key_values_length = 0
 
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
             seq_length_with_past = seq_length_with_past + past_key_values_length
 
         if position_ids is None:
@@ -619,21 +645,19 @@ def forward(
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
+        next_decoder_cache = None
 
-        for idx, decoder_layer in enumerate(self.layers):
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
             if self.gradient_checkpointing and self.training:
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
                     attention_mask,
                     position_ids,
-                    past_key_value,
+                    past_key_values,
                     output_attentions,
                 )
             else:
@@ -641,7 +665,7 @@ def forward(
                     hidden_states,
                     attention_mask=attention_mask,
                     position_ids=position_ids,
-                    past_key_value=past_key_value,
+                    past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                 )
@@ -649,7 +673,7 @@ def forward(
             hidden_states = layer_outputs[0]
 
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
@@ -660,7 +684,10 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = next_decoder_cache if use_cache else None
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
+
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
@@ -801,16 +828,33 @@ def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         if past_key_values is not None:
-            past_length = past_key_values[0][0].shape[2]
-
-            # Some generation methods already pass only the last input ID
-            if input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
             else:
-                # Default to old behavior: keep only final ID
-                remove_prefix_length = input_ids.shape[1] - 1
-
-            input_ids = input_ids[:, remove_prefix_length:]
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
diff --git a/src/transformers/models/phi/modeling_phi.py b/src/transformers/models/phi/modeling_phi.py
index 7f0f8caecb0a4a..c73d5b942e6d4f 100644
--- a/src/transformers/models/phi/modeling_phi.py
+++ b/src/transformers/models/phi/modeling_phi.py
@@ -20,11 +20,13 @@
 from typing import List, Optional, Tuple, Union
 
 import torch
+import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
 from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
 from ...modeling_outputs import (
     BaseModelOutputWithPast,
@@ -37,12 +39,19 @@
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
 from .configuration_phi import PhiConfig
 
 
+if is_flash_attn_2_available():
+    from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
+
+
 logger = logging.get_logger(__name__)
 
 _CHECKPOINT_FOR_DOC = "susnato/phi-1_dev"
@@ -55,6 +64,19 @@
 ]
 
 
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
 # Copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Phi
 class PhiRotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
@@ -75,7 +97,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         self.max_seq_len_cached = seq_len
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -105,7 +127,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
         t = t / self.scaling_factor
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -132,7 +154,7 @@ def _set_cos_sin_cache(self, seq_len, device, dtype):
 
         t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype)
 
-        freqs = torch.einsum("i,j->ij", t, self.inv_freq)
+        freqs = torch.outer(t, self.inv_freq)
         # Different from paper, but it uses a different permutation in order to obtain the same calculation
         emb = torch.cat((freqs, freqs), dim=-1)
         self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
@@ -196,15 +218,24 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
 class PhiAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: PhiConfig):
+    def __init__(self, config: PhiConfig, layer_idx: Optional[int] = None):
         super().__init__()
         self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing `layer_idx` is not recommended and will "
+                "to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
         self.max_position_embeddings = config.max_position_embeddings
         self.rope_theta = config.rope_theta
         self.partial_rotary_factor = config.partial_rotary_factor
+        self.is_causal = True
 
         if (self.head_dim * self.num_heads) != self.hidden_size:
             raise ValueError(
@@ -274,7 +305,7 @@ def forward(
         hidden_states: torch.Tensor,
         attention_mask: Optional[torch.Tensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        past_key_value: Optional[Cache] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
@@ -297,7 +328,13 @@ def forward(
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
-            kv_seq_len += past_key_value[0].shape[-2]
+            if self.layer_idx is None:
+                raise ValueError(
+                    f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
+                    "for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
+                    "with a layer index."
+                )
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         # Partial rotary embedding
@@ -317,11 +354,9 @@ def forward(
         key_states = torch.cat((key_rot, key_pass), dim=-1)
 
         if past_key_value is not None:
-            # reuse k, v, self_attention
-            key_states = torch.cat([past_key_value[0], key_states], dim=2)
-            value_states = torch.cat([past_key_value[1], value_states], dim=2)
-
-        past_key_value = (key_states, value_states) if use_cache else None
+            # Specific to RoPE models with partial rotation
+            cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
 
         attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
 
@@ -361,10 +396,232 @@ def forward(
         return attn_output, attn_weights, past_key_value
 
 
+class PhiFlashAttention2(PhiAttention):
+    """
+    Phi flash attention module. This module inherits from `PhiAttention` as the weights of the module stays untouched.
+    The only required change would be on the forward pass where it needs to correctly call the public API of flash
+    attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # PhiFlashAttention2 attention does not support output_attentions
+
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        # [batch_size, seq_length, 3 x hidden_size]
+        fused_qkv = self.query_key_value(hidden_states)
+
+        # 3 x [batch_size, seq_length, num_heads, head_dim]
+        (query_states, key_states, value_states) = self._split_heads(fused_qkv)
+
+        if self.qk_layernorm:
+            query_states = self.q_layernorm(query_states)
+            key_states = self.k_layernorm(key_states)
+
+        # [batch_size, num_heads, seq_length, head_dim] -> [batch_size, seq_length, num_heads, head_dim]
+        query_states = query_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+
+        # Partial rotary embedding
+        query_rot, query_pass = (
+            query_states[..., : self.rotary_emb.dim],
+            query_states[..., self.rotary_emb.dim :],
+        )
+        key_rot, key_pass = (
+            key_states[..., : self.rotary_emb.dim],
+            key_states[..., self.rotary_emb.dim :],
+        )
+        # [batch_size, seq_length, num_heads, head_dim // config.partial_rotary_factor]
+        query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids)
+
+        # [batch_size, seq_length, num_heads, head_dim]
+        query_states = torch.cat((query_rot, query_pass), dim=-1)
+        key_states = torch.cat((key_rot, key_pass), dim=-1)
+
+        if past_key_value is not None:
+            cache_kwargs = {"sin": sin, "cos": cos, "partial_rotation_size": self.rotary_emb.dim}
+            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        tgt_len = key_states.shape[2]
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        query_states = query_states.transpose(1, 2).view(bsz, q_len, self.num_heads, self.head_dim)
+        key_states = key_states.transpose(1, 2).view(bsz, tgt_len, self.num_heads, self.head_dim)
+        value_states = value_states.transpose(1, 2).view(bsz, tgt_len, self.num_heads, self.head_dim)
+
+        attn_dropout = self.config.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32.
+
+        if query_states.dtype == torch.float32:
+            # Handle the case where the model is quantized
+            if hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=attn_dropout, softmax_scale=1.0
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim)
+        attn_output = self.dense(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token
+        first unpad the input, then computes the attention scores and pad the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`int`, *optional*):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
+        # Contains at least one padding token in the sequence
+        if attention_mask is not None:
+            batch_size = query_states.shape[0]
+            query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
+                query_states, key_states, value_states, attention_mask, query_length
+            )
+
+            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
+            max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
+
+            attn_output_unpad = flash_attn_varlen_func(
+                query_states,
+                key_states,
+                value_states,
+                cu_seqlens_q=cu_seqlens_q,
+                cu_seqlens_k=cu_seqlens_k,
+                max_seqlen_q=max_seqlen_in_batch_q,
+                max_seqlen_k=max_seqlen_in_batch_k,
+                dropout_p=dropout,
+                softmax_scale=softmax_scale,
+                causal=causal,
+            )
+
+            attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
+        else:
+            attn_output = flash_attn_func(
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
+            )
+
+        return attn_output
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input
+    def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
+        indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
+        batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
+
+        key_layer = index_first_axis(
+            key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        value_layer = index_first_axis(
+            value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
+        )
+        if query_length == kv_seq_len:
+            query_layer = index_first_axis(
+                query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
+            )
+            cu_seqlens_q = cu_seqlens_k
+            max_seqlen_in_batch_q = max_seqlen_in_batch_k
+            indices_q = indices_k
+        elif query_length == 1:
+            max_seqlen_in_batch_q = 1
+            cu_seqlens_q = torch.arange(
+                batch_size + 1, dtype=torch.int32, device=query_layer.device
+            )  # There is a memcpy here, that is very bad.
+            indices_q = cu_seqlens_q[:-1]
+            query_layer = query_layer.squeeze(1)
+        else:
+            # The -q_len: slice assumes left padding.
+            attention_mask = attention_mask[:, -query_length:]
+            query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask)
+
+        return (
+            query_layer,
+            key_layer,
+            value_layer,
+            indices_q,
+            (cu_seqlens_q, cu_seqlens_k),
+            (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
+        )
+
+
+PHI_ATTENTION_CLASSES = {
+    "eager": PhiAttention,
+    "flash_attention_2": PhiFlashAttention2,
+}
+
+
 class PhiDecoderLayer(nn.Module):
-    def __init__(self, config: PhiConfig):
+    def __init__(self, config: PhiConfig, layer_idx: int):
         super().__init__()
-        self.self_attn = PhiAttention(config=config)
+        self.self_attn = PHI_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)
         self.mlp = PhiMLP(config)
         self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
         self.resid_dropout = nn.Dropout(config.resid_pdrop)
@@ -450,6 +707,8 @@ class PhiPreTrainedModel(PreTrainedModel):
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
     _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn_2 = True
+    _supports_cache_class = True
 
     def _init_weights(self, module):
         std = self.config.initializer_range
@@ -498,17 +757,23 @@ def _init_weights(self, module):
             config.n_positions - 1]`.
 
             [What are position IDs?](../glossary#position-ids)
-        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
-            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
-            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
-            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
-
-            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
-            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
-
-            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
-            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
-            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
+            Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
+            returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
+
+            Two formats are allowed:
+            - a [`~cache_utils.Cache`] instance;
+            - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+            shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
+            cache format.
+
+            The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
+            legacy cache format will be returned.
+
+            If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
+            have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
+            of shape `(batch_size, sequence_length)`.
         inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
             Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
             is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
@@ -546,8 +811,11 @@ def __init__(self, config: PhiConfig):
 
         self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
         self.embed_dropout = nn.Dropout(config.embd_pdrop)
-        self.layers = nn.ModuleList([PhiDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList(
+            [PhiDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
         self.final_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
 
         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -590,12 +858,13 @@ def forward(
         else:
             raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
 
-        seq_length_with_past = seq_length
         past_key_values_length = 0
 
-        if past_key_values is not None:
-            past_key_values_length = past_key_values[0][0].shape[2]
-            seq_length_with_past = seq_length_with_past + past_key_values_length
+        if use_cache:
+            use_legacy_cache = not isinstance(past_key_values, Cache)
+            if use_legacy_cache:
+                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+            past_key_values_length = past_key_values.get_usable_length(seq_length)
 
         if position_ids is None:
             device = input_ids.device if input_ids is not None else inputs_embeds.device
@@ -609,14 +878,15 @@ def forward(
 
         inputs_embeds = self.embed_dropout(inputs_embeds)
 
-        # embed positions
-        if attention_mask is None:
-            attention_mask = torch.ones(
-                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+        # Attention mask.
+        if self._use_flash_attention_2:
+            # 2d mask is passed through the layers
+            attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        else:
+            # 4d mask is passed through the layers
+            attention_mask = _prepare_4d_causal_attention_mask(
+                attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
             )
-        attention_mask = _prepare_4d_causal_attention_mask(
-            attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
-        )
 
         hidden_states = inputs_embeds
 
@@ -630,21 +900,19 @@ def forward(
         # decoder layers
         all_hidden_states = () if output_hidden_states else None
         all_self_attns = () if output_attentions else None
-        next_decoder_cache = () if use_cache else None
+        next_decoder_cache = None
 
-        for idx, decoder_layer in enumerate(self.layers):
+        for decoder_layer in self.layers:
             if output_hidden_states:
                 all_hidden_states += (hidden_states,)
 
-            past_key_value = past_key_values[idx] if past_key_values is not None else None
-
             if self.gradient_checkpointing and self.training:
                 layer_outputs = self._gradient_checkpointing_func(
                     decoder_layer.__call__,
                     hidden_states,
                     attention_mask,
                     position_ids,
-                    past_key_value,
+                    past_key_values,
                     output_attentions,
                 )
             else:
@@ -652,7 +920,7 @@ def forward(
                     hidden_states,
                     attention_mask=attention_mask,
                     position_ids=position_ids,
-                    past_key_value=past_key_value,
+                    past_key_value=past_key_values,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
                 )
@@ -660,7 +928,7 @@ def forward(
             hidden_states = layer_outputs[0]
 
             if use_cache:
-                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+                next_decoder_cache = layer_outputs[2 if output_attentions else 1]
 
             if output_attentions:
                 all_self_attns += (layer_outputs[1],)
@@ -671,7 +939,9 @@ def forward(
         if output_hidden_states:
             all_hidden_states += (hidden_states,)
 
-        next_cache = next_decoder_cache if use_cache else None
+        next_cache = None
+        if use_cache:
+            next_cache = next_decoder_cache.to_legacy_cache() if use_legacy_cache else next_decoder_cache
         if not return_dict:
             return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
         return BaseModelOutputWithPast(
@@ -813,16 +1083,33 @@ def prepare_inputs_for_generation(
         self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
     ):
         if past_key_values is not None:
-            past_length = past_key_values[0][0].shape[2]
-
-            # Some generation methods already pass only the last input ID
-            if input_ids.shape[1] > past_length:
-                remove_prefix_length = past_length
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+                max_cache_length = past_key_values.get_max_length()
             else:
-                # Default to old behavior: keep only final ID
-                remove_prefix_length = input_ids.shape[1] - 1
-
-            input_ids = input_ids[:, remove_prefix_length:]
+                cache_length = past_length = past_key_values[0][0].shape[2]
+                max_cache_length = None
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+
+            # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
+            if (
+                max_cache_length is not None
+                and attention_mask is not None
+                and cache_length + input_ids.shape[1] > max_cache_length
+            ):
+                attention_mask = attention_mask[:, -max_cache_length:]
 
         position_ids = kwargs.get("position_ids", None)
         if attention_mask is not None and position_ids is None:
diff --git a/src/transformers/models/plbart/modeling_plbart.py b/src/transformers/models/plbart/modeling_plbart.py
index ad298c6d389048..f03f90183a59a5 100644
--- a/src/transformers/models/plbart/modeling_plbart.py
+++ b/src/transformers/models/plbart/modeling_plbart.py
@@ -23,7 +23,12 @@
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
 from ...activations import ACT2FN
-from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+    _prepare_4d_causal_attention_mask,
+    _prepare_4d_causal_attention_mask_for_sdpa,
+)
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -265,9 +270,8 @@ class PLBartEncoderLayer(nn.Module):
     def __init__(self, config: PLBartConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = PLBART_ATTENTION_CLASSES[attn_type](
+        self.self_attn = PLBART_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -332,7 +336,8 @@ def forward(
         return outputs
 
 
-PLBART_ATTENTION_CLASSES = {"default": PLBartAttention}
+# TODO: Implement attention with SDPA for PLBart.
+PLBART_ATTENTION_CLASSES = {"eager": PLBartAttention}
 
 
 # Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->PLBart, BART->PLBART
@@ -341,8 +346,7 @@ def __init__(self, config: PLBartConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
-        self.self_attn = PLBART_ATTENTION_CLASSES[attn_type](
+        self.self_attn = PLBART_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -355,7 +359,7 @@ def __init__(self, config: PLBartConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = PLBART_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = PLBART_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -670,6 +674,8 @@ def __init__(self, config: PLBartConfig, embed_tokens: Optional[nn.Embedding] =
             embed_dim,
         )
         self.layers = nn.ModuleList([PLBartEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
         self.layernorm_embedding = nn.LayerNorm(embed_dim)
 
         self.gradient_checkpointing = False
@@ -757,8 +763,13 @@ def forward(
 
         # expand attention_mask
         if attention_mask is not None:
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 attention_mask = attention_mask if 0 in attention_mask else None
+            elif self._use_sdpa and head_mask is None and not output_attentions:
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
             else:
                 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
@@ -846,6 +857,9 @@ def __init__(self, config: PLBartConfig, embed_tokens: Optional[nn.Embedding] =
             config.d_model,
         )
         self.layers = nn.ModuleList([PLBartDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
+
         self.layernorm_embedding = nn.LayerNorm(config.d_model)
 
         self.gradient_checkpointing = False
@@ -964,9 +978,18 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input) * self.embed_scale
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None:
+            # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+            # the manual implementation that requires a 4D causal mask in all cases.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask,
+                input_shape,
+                inputs_embeds,
+                past_key_values_length,
+            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
@@ -975,8 +998,17 @@ def forward(
 
         # expand encoder attention mask
         if encoder_hidden_states is not None and encoder_attention_mask is not None:
-            if getattr(self.config, "_flash_attn_2_enabled", False):
+            if self._use_flash_attention_2:
                 encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions:
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
             else:
                 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
                 encoder_attention_mask = _prepare_4d_attention_mask(
diff --git a/src/transformers/models/rag/retrieval_rag.py b/src/transformers/models/rag/retrieval_rag.py
index 88cb54115bf548..76f6231ec28fbb 100644
--- a/src/transformers/models/rag/retrieval_rag.py
+++ b/src/transformers/models/rag/retrieval_rag.py
@@ -23,7 +23,7 @@
 
 from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import BatchEncoding
-from ...utils import cached_file, is_datasets_available, is_faiss_available, logging, requires_backends
+from ...utils import cached_file, is_datasets_available, is_faiss_available, logging, requires_backends, strtobool
 from .configuration_rag import RagConfig
 from .tokenization_rag import RagTokenizer
 
@@ -131,6 +131,13 @@ def _resolve_path(self, index_path, filename):
     def _load_passages(self):
         logger.info(f"Loading passages from {self.index_path}")
         passages_path = self._resolve_path(self.index_path, self.PASSAGE_FILENAME)
+        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+            raise ValueError(
+                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
+                "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
+                "that could have been tampered with. If you already verified the pickle data and decided to use it, "
+                "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
+            )
         with open(passages_path, "rb") as passages_file:
             passages = pickle.load(passages_file)
         return passages
@@ -140,6 +147,13 @@ def _deserialize_index(self):
         resolved_index_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index.dpr")
         self.index = faiss.read_index(resolved_index_path)
         resolved_meta_path = self._resolve_path(self.index_path, self.INDEX_FILENAME + ".index_meta.dpr")
+        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+            raise ValueError(
+                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
+                "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
+                "that could have been tampered with. If you already verified the pickle data and decided to use it, "
+                "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
+            )
         with open(resolved_meta_path, "rb") as metadata_file:
             self.index_id_to_db_id = pickle.load(metadata_file)
         assert (
diff --git a/src/transformers/models/reformer/modeling_reformer.py b/src/transformers/models/reformer/modeling_reformer.py
index 275a1e1dc738b3..7096a57d0fa4ee 100755
--- a/src/transformers/models/reformer/modeling_reformer.py
+++ b/src/transformers/models/reformer/modeling_reformer.py
@@ -2139,7 +2139,7 @@ def _pad_to_mult_of_chunk_length(
         padded_seq_length=None,
         device=None,
     ):
-        logger.info(
+        logger.warning_once(
             f"Input ids are automatically padded from {input_shape[-1]} to {input_shape[-1] + padding_length} to be a "
             f"multiple of `config.chunk_length`: {padded_seq_length}"
         )
diff --git a/src/transformers/models/rembert/tokenization_rembert.py b/src/transformers/models/rembert/tokenization_rembert.py
index c1f12527ef5974..9403e911769184 100644
--- a/src/transformers/models/rembert/tokenization_rembert.py
+++ b/src/transformers/models/rembert/tokenization_rembert.py
@@ -21,7 +21,7 @@
 
 import sentencepiece as spm
 
-from ...tokenization_utils import PreTrainedTokenizer
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
 from ...utils import logging
 
 
@@ -111,6 +111,9 @@ def __init__(
         mask_token="[MASK]",
         **kwargs,
     ):
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
         self.do_lower_case = do_lower_case
         self.remove_space = remove_space
         self.keep_accents = keep_accents
diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
index c46b20ba715ec7..4986edc729417b 100755
--- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
+++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -159,25 +159,6 @@ class SeamlessM4TGenerationOutput(ModelOutput):
             If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
-        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
-            1]`:
-
-            - 1 indicates the head is **not masked**,
-            - 0 indicates the head is **masked**.
-
         encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
             Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
             `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
@@ -1090,10 +1071,10 @@ def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_
         return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
 
 
-# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->SeamlessM4T,key_value_states->encoder_hidden_states
 class SeamlessM4TAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
+    # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4T
     def __init__(
         self,
         embed_dim: int,
@@ -1134,7 +1115,6 @@ def forward(
         encoder_hidden_states: Optional[torch.Tensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
         output_attentions: bool = False,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
         """Input shape: Batch x Time x Channel"""
@@ -1208,15 +1188,6 @@ def forward(
 
         attn_weights = nn.functional.softmax(attn_weights, dim=-1)
 
-        if layer_head_mask is not None:
-            if layer_head_mask.size() != (self.num_heads,):
-                raise ValueError(
-                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
-                    f" {layer_head_mask.size()}"
-                )
-            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
-            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
-
         if output_attentions:
             # this operation is a bit awkward, but it's required to
             # make sure that attn_weights keeps its gradient.
@@ -1298,7 +1269,6 @@ def forward(
         self,
         hidden_states: torch.Tensor,
         attention_mask: torch.Tensor,
-        layer_head_mask: torch.Tensor,
         output_attentions: bool = False,
     ) -> torch.Tensor:
         """
@@ -1308,15 +1278,12 @@ def forward(
             attention_mask (`torch.FloatTensor`):
                 attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                 large negative values.
-            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
-                `(encoder_attention_heads,)`.
         """
         residual = hidden_states
         hidden_states = self.self_attn_layer_norm(hidden_states)
         hidden_states, attn_weights, _ = self.self_attn(
             hidden_states=hidden_states,
             attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
         hidden_states = self.attn_dropout(hidden_states)
@@ -1375,8 +1342,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.Tensor] = None,
         encoder_attention_mask: Optional[torch.Tensor] = None,
-        layer_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         output_attentions: Optional[bool] = False,
         use_cache: Optional[bool] = True,
@@ -1393,10 +1358,6 @@ def forward(
             encoder_attention_mask (`torch.FloatTensor`):
                 encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
                 very large negative values.
-            layer_head_mask (`torch.FloatTensor`):
-                mask for attention heads in a given layer of size `(encoder_attention_heads,)`.
-            cross_attn_layer_head_mask (`torch.FloatTensor`):
-                mask for cross-attention heads in a given layer of size `(decoder_attention_heads,)`.
             past_key_value (`Tuple(torch.FloatTensor)`):
                 cached past key and value projection states
             output_attentions (`bool`, *optional*):
@@ -1414,7 +1375,6 @@ def forward(
             hidden_states=hidden_states,
             past_key_value=self_attn_past_key_value,
             attention_mask=attention_mask,
-            layer_head_mask=layer_head_mask,
             output_attentions=output_attentions,
         )
         hidden_states = self.attn_dropout(hidden_states)
@@ -1435,7 +1395,6 @@ def forward(
                 encoder_hidden_states=encoder_hidden_states,
                 past_key_value=cross_attn_past_key_value,
                 attention_mask=encoder_attention_mask,
-                layer_head_mask=cross_attn_layer_head_mask,
                 output_attentions=output_attentions,
             )
             hidden_states = self.attn_dropout(hidden_states)
@@ -1710,7 +1669,6 @@ def forward(
         self,
         input_ids: torch.LongTensor = None,
         attention_mask: Optional[torch.Tensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
@@ -1734,12 +1692,6 @@ def forward(
                 - 0 for tokens that are **masked**.
 
                 [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
             inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                 Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                 This is useful if you want more control over how to convert `input_ids` indices into associated vectors
@@ -1796,13 +1748,6 @@ def forward(
         encoder_states = () if output_hidden_states else None
         all_attentions = () if output_attentions else None
 
-        # check if head_mask has a correct number of layers specified if desired
-        if head_mask is not None:
-            if head_mask.size()[0] != len(self.layers):
-                raise ValueError(
-                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
-                    f" {head_mask.size()[0]}."
-                )
         for idx, encoder_layer in enumerate(self.layers):
             if output_hidden_states:
                 encoder_states = encoder_states + (hidden_states,)
@@ -1821,14 +1766,12 @@ def forward(
                         encoder_layer.forward,
                         hidden_states,
                         attention_mask,
-                        (head_mask[idx] if head_mask is not None else None),
                         output_attentions,
                     )
                 else:
                     layer_outputs = encoder_layer(
                         hidden_states,
                         attention_mask,
-                        layer_head_mask=(head_mask[idx] if head_mask is not None else None),
                         output_attentions=output_attentions,
                     )
 
@@ -1912,8 +1855,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         encoder_hidden_states: Optional[torch.FloatTensor] = None,
         encoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         use_cache: Optional[bool] = None,
@@ -1949,19 +1890,6 @@ def forward(
                 - 0 for tokens that are **masked**.
 
                 [What are attention masks?](../glossary#attention-mask)
-            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
-            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
-                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
-                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
-
-                - 1 indicates the head is **not masked**,
-                - 0 indicates the head is **masked**.
-
             past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                 Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                 shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
@@ -2043,14 +1971,6 @@ def forward(
         all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
         next_decoder_cache = () if use_cache else None
 
-        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
-        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
-            if attn_mask is not None:
-                if attn_mask.size()[0] != len(self.layers):
-                    raise ValueError(
-                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
-                        f" {attn_mask.size()[0]}."
-                    )
         for idx, decoder_layer in enumerate(self.layers):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             if output_hidden_states:
@@ -2069,8 +1989,6 @@ def forward(
                     attention_mask,
                     encoder_hidden_states,
                     encoder_attention_mask,
-                    head_mask[idx] if head_mask is not None else None,
-                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                     None,
                     output_attentions,
                     use_cache,
@@ -2081,10 +1999,6 @@ def forward(
                     attention_mask=attention_mask,
                     encoder_hidden_states=encoder_hidden_states,
                     encoder_attention_mask=encoder_attention_mask,
-                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
-                    cross_attn_layer_head_mask=(
-                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
-                    ),
                     past_key_value=past_key_value,
                     output_attentions=output_attentions,
                     use_cache=use_cache,
@@ -2143,16 +2057,12 @@ def __init__(
         # Initialize weights and apply final processing
         self.post_init()
 
-    # Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100Model.forward
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -2173,7 +2083,6 @@ def forward(
             encoder_outputs = self.encoder(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                head_mask=head_mask,
                 inputs_embeds=inputs_embeds,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -2193,8 +2102,6 @@ def forward(
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
             encoder_attention_mask=attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
@@ -2278,9 +2185,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -2308,9 +2212,6 @@ def forward(
             decoder_input_ids=decoder_input_ids,
             encoder_outputs=encoder_outputs,
             decoder_attention_mask=decoder_attention_mask,
-            head_mask=head_mask,
-            decoder_head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=inputs_embeds,
             decoder_inputs_embeds=decoder_inputs_embeds,
@@ -2348,9 +2249,6 @@ def prepare_inputs_for_generation(
         decoder_input_ids,
         past_key_values=None,
         attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs,
@@ -2365,9 +2263,6 @@ def prepare_inputs_for_generation(
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,
         }
 
@@ -2798,9 +2693,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -2832,7 +2724,6 @@ def forward(
             encoder_outputs = self.text_encoder(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                head_mask=head_mask,
                 inputs_embeds=inputs_embeds,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -2854,8 +2745,6 @@ def forward(
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
             encoder_attention_mask=encoder_attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
@@ -3007,9 +2896,6 @@ def prepare_inputs_for_generation(
         decoder_input_ids,
         past_key_values=None,
         attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs,
@@ -3024,9 +2910,6 @@ def prepare_inputs_for_generation(
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,
         }
 
@@ -3095,9 +2978,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -3129,7 +3009,6 @@ def forward(
             encoder_outputs = self.speech_encoder(
                 input_features=input_features,
                 attention_mask=attention_mask,
-                head_mask=head_mask,
                 inputs_embeds=inputs_embeds,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -3158,8 +3037,6 @@ def forward(
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
             encoder_attention_mask=encoder_attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
@@ -3312,9 +3189,6 @@ def prepare_inputs_for_generation(
         decoder_input_ids,
         past_key_values=None,
         attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs,
@@ -3329,9 +3203,6 @@ def prepare_inputs_for_generation(
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,
         }
 
@@ -3408,9 +3279,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -3447,7 +3315,6 @@ def forward(
             encoder_outputs = self.text_encoder(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                head_mask=head_mask,
                 inputs_embeds=inputs_embeds,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -3469,8 +3336,6 @@ def forward(
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
             encoder_attention_mask=encoder_attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
@@ -3624,8 +3489,6 @@ def generate(
             input_ids=sequences,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=attention_mask,
-            head_mask=kwargs_text.get("decoder_head_mask"),
-            cross_attn_head_mask=kwargs_text.get("cross_attn_head_mask"),
         ).last_hidden_state
 
         pad_token_id = self.generation_config.pad_token_id
@@ -3678,9 +3541,6 @@ def prepare_inputs_for_generation(
         decoder_input_ids,
         past_key_values=None,
         attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs,
@@ -3695,9 +3555,6 @@ def prepare_inputs_for_generation(
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,
         }
 
@@ -3769,9 +3626,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -3809,7 +3663,6 @@ def forward(
             encoder_outputs = self.speech_encoder(
                 input_features=input_features,
                 attention_mask=attention_mask,
-                head_mask=head_mask,
                 inputs_embeds=inputs_embeds,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -3838,8 +3691,6 @@ def forward(
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
             encoder_attention_mask=encoder_attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
@@ -3999,8 +3850,6 @@ def generate(
             input_ids=sequences,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=attention_mask,
-            head_mask=kwargs_text.get("decoder_head_mask"),
-            cross_attn_head_mask=kwargs_text.get("cross_attn_head_mask"),
         ).last_hidden_state
 
         pad_token_id = self.generation_config.pad_token_id
@@ -4063,9 +3912,6 @@ def prepare_inputs_for_generation(
         decoder_input_ids,
         past_key_values=None,
         attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs,
@@ -4080,9 +3926,6 @@ def prepare_inputs_for_generation(
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,
         }
 
@@ -4167,9 +4010,6 @@ def forward(
         attention_mask: Optional[torch.Tensor] = None,
         decoder_input_ids: Optional[torch.LongTensor] = None,
         decoder_attention_mask: Optional[torch.LongTensor] = None,
-        head_mask: Optional[torch.Tensor] = None,
-        decoder_head_mask: Optional[torch.Tensor] = None,
-        cross_attn_head_mask: Optional[torch.Tensor] = None,
         encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
@@ -4242,7 +4082,6 @@ def forward(
             encoder_outputs = self.text_encoder(
                 input_ids=input_ids,
                 attention_mask=attention_mask,
-                head_mask=head_mask,
                 inputs_embeds=inputs_embeds,
                 output_attentions=output_attentions,
                 output_hidden_states=output_hidden_states,
@@ -4272,8 +4111,6 @@ def forward(
             attention_mask=decoder_attention_mask,
             encoder_hidden_states=encoder_outputs[0],
             encoder_attention_mask=encoder_attention_mask,
-            head_mask=decoder_head_mask,
-            cross_attn_head_mask=cross_attn_head_mask,
             past_key_values=past_key_values,
             inputs_embeds=decoder_inputs_embeds,
             use_cache=use_cache,
@@ -4477,8 +4314,6 @@ def generate(
             input_ids=sequences,
             encoder_hidden_states=encoder_hidden_states,
             encoder_attention_mask=attention_mask,
-            head_mask=kwargs_text.get("decoder_head_mask"),
-            cross_attn_head_mask=kwargs_text.get("cross_attn_head_mask"),
         ).last_hidden_state
 
         pad_token_id = self.generation_config.pad_token_id
@@ -4531,9 +4366,6 @@ def prepare_inputs_for_generation(
         decoder_input_ids,
         past_key_values=None,
         attention_mask=None,
-        head_mask=None,
-        decoder_head_mask=None,
-        cross_attn_head_mask=None,
         use_cache=None,
         encoder_outputs=None,
         **kwargs,
@@ -4548,9 +4380,6 @@ def prepare_inputs_for_generation(
             "past_key_values": past_key_values,
             "decoder_input_ids": decoder_input_ids,
             "attention_mask": attention_mask,
-            "head_mask": head_mask,
-            "decoder_head_mask": decoder_head_mask,
-            "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,
         }
 
diff --git a/src/transformers/models/seamless_m4t_v2/__init__.py b/src/transformers/models/seamless_m4t_v2/__init__.py
new file mode 100644
index 00000000000000..ebc4caef2da10a
--- /dev/null
+++ b/src/transformers/models/seamless_m4t_v2/__init__.py
@@ -0,0 +1,65 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_seamless_m4t_v2": ["SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "SeamlessM4Tv2Config"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_seamless_m4t_v2"] = [
+        "SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "SeamlessM4Tv2ForTextToSpeech",
+        "SeamlessM4Tv2ForSpeechToSpeech",
+        "SeamlessM4Tv2ForTextToText",
+        "SeamlessM4Tv2ForSpeechToText",
+        "SeamlessM4Tv2Model",
+        "SeamlessM4Tv2PreTrainedModel",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_seamless_m4t_v2 import SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, SeamlessM4Tv2Config
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_seamless_m4t_v2 import (
+            SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
+            SeamlessM4Tv2ForSpeechToSpeech,
+            SeamlessM4Tv2ForSpeechToText,
+            SeamlessM4Tv2ForTextToSpeech,
+            SeamlessM4Tv2ForTextToText,
+            SeamlessM4Tv2Model,
+            SeamlessM4Tv2PreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py
new file mode 100644
index 00000000000000..734e341fb64044
--- /dev/null
+++ b/src/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py
@@ -0,0 +1,426 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" SeamlessM4Tv2 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SEAMLESS_M4T_V2_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "": "https://huggingface.co//resolve/main/config.json",
+}
+
+
+class SeamlessM4Tv2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~SeamlessM4Tv2Model`]. It is used to instantiate
+    an SeamlessM4Tv2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SeamlessM4Tv2
+    [""](https://huggingface.co/"") architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256102):
+            Vocabulary size of the text modality of the SeamlessM4Tv2 model. Defines the number of different tokens
+            that can be represented by the `inputs_ids` passed when calling [`~SeamlessM4Tv2Model`],
+            [`~SeamlessM4Tv2ForTextToSpeech`] or [`~SeamlessM4Tv2ForTextToText`].
+        t2u_vocab_size (`int`, *optional*, defaults to 10082):
+            Unit vocabulary size of the SeamlessM4Tv2 model. Defines the number of different "unit tokens" that can be
+            represented by the `inputs_ids` passed when calling the Text-To-Units sub-model of [`~SeamlessM4Tv2Model`],
+            [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`].
+        char_vocab_size (`int`, *optional*, defaults to 10943):
+            Character vocabulary size of the SeamlessM4Tv2 model. Defines the number of different character tokens that
+            can be represented by the `char_inputs_ids` passed when calling the Text-To-Units sub-model of
+            [`~SeamlessM4Tv2Model`], [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`].
+
+        > Parameters shared across sub-models
+
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the "intermediate" layers in the architecture.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model text encoder and decoder might ever be used with. Typically set
+            this to something large just in case (e.g., 512 or 1024 or 2048).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the encoders. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the decoders. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
+            for more details.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the decoder and feed-forward layers. If string,
+            `"gelu"`, `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, decoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all attention layers.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all activation layers in the model.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Scale embeddings by diving by sqrt(d_model).
+
+        > Text encoder and text decoder specific parameters
+
+        encoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer text encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text encoder.
+        decoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer text decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text decoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text decoder.
+        decoder_start_token_id (`int`, *optional*, defaults to 3):
+            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only
+            applied in the text decoder.
+        max_new_tokens (`int`, *optional*, defaults to 256):
+            The maximum numbers of text tokens to generate, ignoring the number of tokens in the prompt.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the _padding_ text token. Only applied to the text-decoder model.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            The id of the _beginning-of-stream_ text token. Only applied to the text-decoder model.
+        eos_token_id (`int`, *optional*, defaults to 3):
+            The id of the _end-of-stream_ text token. Only applied to the text-decoder model.
+
+        > Speech encoder specific parameters
+
+        speech_encoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer speech encoder.
+        speech_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer speech encoder.
+        speech_encoder_intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer speech encoder.
+        speech_encoder_hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in the speech encoder. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        speech_encoder_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all layers in the speech encoder.
+        add_adapter (`bool`, *optional*, defaults to `True`):
+            Add an adapter layer on top of the speech encoder.
+        speech_encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability for the speech encoder. See the [LayerDrop paper](see
+            https://arxiv.org/abs/1909.11556) for more details.
+        feature_projection_input_dim (`int`, *optional*, defaults to 160):
+            Input dimension of the input feature projection of the speech encoder, i.e the dimension after processing
+            input audios with [`SeamlessM4TFeatureExtractor`].
+        adaptor_kernel_size (`int`, *optional*, defaults to 8):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adaptor_stride (`int`, *optional*, defaults to 8):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adaptor_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all layers in the speech adapter.
+        num_adapter_layers (`int`, *optional*, defaults to 1):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`):
+            Can be specified to `relative_key`. If left to `None`, no relative position embedding is applied. Only
+            applied to the speech encoder. For more information on `"relative_key"`, please refer to [Self-Attention
+            with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks. Only applied to the speech encoder.
+        left_max_position_embeddings (`int`, *optional*, defaults to 64):
+            The left clipping value for relative positions.
+        right_max_position_embeddings (`int`, *optional*, defaults to 8):
+            The right clipping value for relative positions.
+        speech_encoder_chunk_size (`int`, *optional*, defaults to 20000): The size of each attention chunk.
+        speech_encoder_left_chunk_num (`int`, *optional*, defaults to 128):
+            Number of chunks on the left up to which lookahead is allowed.
+
+        > Text-To-Unit (t2u) model specific parameters
+
+        t2u_bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the _beginning-of-stream_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_pad_token_id (`int`, *optional*, defaults to 1):
+            The id of the _padding_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the _end-of-stream_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_encoder_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer text-to-unit encoder.
+        t2u_encoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit encoder.
+        t2u_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text-to-unit encoder.
+        t2u_decoder_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer text-to-unit decoder.
+        t2u_decoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit decoder.
+        t2u_decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text-to-unit decoder.
+        t2u_max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model text-to-unit component might ever be used with. Typically set
+            this to something large just in case (e.g., 512 or 1024 or 2048).
+        t2u_variance_predictor_embed_dim (`int`, *optional*, defaults to 1024):
+            The projection dimension of the text-to-unit's duration predictor.
+        t2u_variance_predictor_hidden_dim (`int`, *optional*, defaults to 256):
+            Internal dimension of the text-to-unit's duration predictor.
+        t2u_variance_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers of the text-to-unit's duration predictor.
+        t2u_variance_pred_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout probabilitiy of the text-to-unit's duration predictor.
+
+         > Hifi-Gan Vocoder specific parameters
+
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the output audio will be generated, expressed in hertz (Hz).
+        upsample_initial_channel (`int`, *optional*, defaults to 512):
+            The number of input channels into the hifi-gan upsampling network. Applies to the vocoder only.
+        upsample_rates (`Tuple[int]` or `List[int]`, *optional*, defaults to `[5, 4, 4, 2, 2]`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the vocoder upsampling network.
+            The length of *upsample_rates* defines the number of convolutional layers and has to match the length of
+            *upsample_kernel_sizes*. Applies to the vocoder only.
+        upsample_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[11, 8, 8, 4, 4]`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the vocoder upsampling
+            network. The length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match
+            the length of *upsample_rates*. Applies to the vocoder only.
+        resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 7, 11]`):
+            A tuple of integers defining the kernel sizes of the vocoder 1D convolutional layers in the multi-receptive
+            field fusion (MRF) module. Applies to the vocoder only.
+        resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
+            A nested tuple of integers defining the dilation rates of the vocoder dilated 1D convolutional layers in
+            the multi-receptive field fusion (MRF) module. Applies to the vocoder only.
+        leaky_relu_slope (`float`, *optional*, defaults to 0.1):
+            The angle of the negative slope used by the leaky ReLU activation in the vocoder. Applies to the vocoder
+            only.
+        unit_hifi_gan_vocab_size (`int`, *optional*, defaults to 10000):
+            Vocabulary size of the SeamlessM4Tv2 vocoder. Defines the number of different unit tokens that can be
+            represented by the `inputs_ids` passed when calling the vocoder of [`~SeamlessM4Tv2Model`],
+            [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`].
+        unit_embed_dim (`int`, *optional*, defaults to 1280):
+            The projection dimension of the input ids given to the hifi-gan vocoder. Applies to the vocoder only.
+        lang_embed_dim (`int`, *optional*, defaults to 256):
+            The projection dimension of the target language given to the hifi-gan vocoder. Applies to the vocoder only.
+        spkr_embed_dim (`int`, *optional*, defaults to 256):
+            The projection dimension of the speaker id given to the hifi-gan vocoder. Applies to the vocoder only.
+        vocoder_num_langs (`int`, *optional*, defaults to 36):
+            Number of langs supported by the vocoder. Might be different from `t2u_num_langs`.
+        vocoder_num_spkrs (`int`, *optional*, defaults to 200):
+            Number of speakers supported by the vocoder.
+        variance_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the duration predictor. Applies to the vocoder only.
+        var_pred_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout probabilitiy of the duration predictor. Applies to the vocoder only.
+        vocoder_offset (`int`, *optional*, defaults to 4):
+            Offset the unit token ids by this number to account for symbol tokens. Applies to the vocoder only.
+
+    ```python
+    >>> from transformers import SeamlessM4Tv2Model, SeamlessM4Tv2Config
+
+    >>> # Initializing a SeamlessM4Tv2 "" style configuration
+    >>> configuration = SeamlessM4Tv2Config()
+
+    >>> # Initializing a model from the "" style configuration
+    >>> model = SeamlessM4Tv2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "seamless_m4t_v2"
+
+    def __init__(
+        self,
+        vocab_size=256102,
+        t2u_vocab_size=10082,
+        char_vocab_size=10943,
+        # shared config
+        hidden_size=1024,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        max_position_embeddings=4096,
+        is_encoder_decoder=True,
+        encoder_layerdrop=0.05,
+        decoder_layerdrop=0.05,
+        activation_function="relu",
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        scale_embedding=True,
+        # text encoder|decoder
+        encoder_layers=24,
+        encoder_ffn_dim=8192,
+        encoder_attention_heads=16,
+        decoder_layers=24,
+        decoder_ffn_dim=8192,
+        decoder_attention_heads=16,
+        decoder_start_token_id=3,
+        max_new_tokens=256,
+        pad_token_id=0,
+        bos_token_id=2,
+        eos_token_id=3,
+        # speech_encoder
+        speech_encoder_layers=24,
+        speech_encoder_attention_heads=16,
+        speech_encoder_intermediate_size=4096,
+        speech_encoder_hidden_act="swish",
+        speech_encoder_dropout=0.0,
+        add_adapter=True,
+        speech_encoder_layerdrop=0.1,
+        feature_projection_input_dim=160,
+        adaptor_kernel_size=8,
+        adaptor_stride=8,
+        adaptor_dropout=0.1,
+        num_adapter_layers=1,
+        position_embeddings_type="relative_key",
+        conv_depthwise_kernel_size=31,
+        left_max_position_embeddings=64,
+        right_max_position_embeddings=8,
+        speech_encoder_chunk_size=20000,
+        speech_encoder_left_chunk_num=128,
+        # t2u config
+        t2u_bos_token_id=0,
+        t2u_pad_token_id=1,
+        t2u_eos_token_id=2,
+        t2u_encoder_layers=6,
+        t2u_encoder_ffn_dim=8192,
+        t2u_encoder_attention_heads=16,
+        t2u_decoder_layers=6,
+        t2u_decoder_ffn_dim=8192,
+        t2u_decoder_attention_heads=16,
+        t2u_max_position_embeddings=4096,
+        t2u_variance_predictor_embed_dim=1024,
+        t2u_variance_predictor_hidden_dim=256,
+        t2u_variance_predictor_kernel_size=3,
+        t2u_variance_pred_dropout=0.5,
+        # hifi-gan vocoder config
+        sampling_rate=16000,
+        upsample_initial_channel=512,
+        upsample_rates=[5, 4, 4, 2, 2],
+        upsample_kernel_sizes=[11, 8, 8, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        leaky_relu_slope=0.1,
+        # specific to Code Hifi-Gan
+        unit_hifi_gan_vocab_size=10000,
+        unit_embed_dim=1280,
+        lang_embed_dim=256,
+        spkr_embed_dim=256,
+        vocoder_num_langs=36,
+        vocoder_num_spkrs=200,
+        variance_predictor_kernel_size=3,
+        var_pred_dropout=0.5,
+        vocoder_offset=4,
+        **kwargs,
+    ):
+        # overall_config
+        self.vocab_size = vocab_size
+        self.t2u_vocab_size = t2u_vocab_size
+        self.char_vocab_size = char_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.max_new_tokens = max_new_tokens
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.activation_function = activation_function
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.scale_embedding = scale_embedding
+        # for proper config init
+        self.num_attention_heads = decoder_attention_heads
+        self.num_hidden_layers = decoder_layers
+
+        # text|unit encoder|decoder
+        self.encoder_layers = encoder_layers
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_layers = decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_attention_heads = decoder_attention_heads
+
+        # speech_encoder
+        self.speech_encoder_layers = speech_encoder_layers
+        self.speech_encoder_hidden_act = speech_encoder_hidden_act
+        self.speech_encoder_dropout = speech_encoder_dropout
+        self.speech_encoder_attention_heads = speech_encoder_attention_heads
+        self.speech_encoder_layerdrop = speech_encoder_layerdrop
+        self.speech_encoder_intermediate_size = speech_encoder_intermediate_size
+        self.feature_projection_input_dim = feature_projection_input_dim
+        self.adaptor_kernel_size = adaptor_kernel_size
+        self.adaptor_stride = adaptor_stride
+        self.adaptor_dropout = adaptor_dropout
+        self.num_adapter_layers = num_adapter_layers
+        self.position_embeddings_type = position_embeddings_type
+        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
+        self.add_adapter = add_adapter
+        self.left_max_position_embeddings = left_max_position_embeddings
+        self.right_max_position_embeddings = right_max_position_embeddings
+        self.speech_encoder_chunk_size = speech_encoder_chunk_size
+        self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num
+
+        # t2u config
+        self.t2u_bos_token_id = t2u_bos_token_id
+        self.t2u_pad_token_id = t2u_pad_token_id
+        self.t2u_eos_token_id = t2u_eos_token_id
+        self.t2u_encoder_layers = t2u_encoder_layers
+        self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim
+        self.t2u_encoder_attention_heads = t2u_encoder_attention_heads
+        self.t2u_decoder_layers = t2u_decoder_layers
+        self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim
+        self.t2u_decoder_attention_heads = t2u_decoder_attention_heads
+        self.t2u_max_position_embeddings = t2u_max_position_embeddings
+        self.t2u_variance_predictor_embed_dim = t2u_variance_predictor_embed_dim  # TODO: add to docstrings
+        self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim  # TODO: add to docstrings
+        self.t2u_variance_predictor_kernel_size = t2u_variance_predictor_kernel_size  # TODO: add to docstrings
+        self.t2u_variance_pred_dropout = t2u_variance_pred_dropout  # TODO: add to docstrings
+
+        # hifi-gan vocoder config
+        # original parameters specific to Hifi-Gan
+        self.sampling_rate = sampling_rate
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.leaky_relu_slope = leaky_relu_slope
+
+        # specific to Code Hifi-Gan
+        self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size
+        self.unit_embed_dim = unit_embed_dim
+        self.lang_embed_dim = lang_embed_dim
+        self.spkr_embed_dim = spkr_embed_dim
+        self.vocoder_num_langs = vocoder_num_langs
+        self.vocoder_num_spkrs = vocoder_num_spkrs
+        self.variance_predictor_kernel_size = variance_predictor_kernel_size
+        self.var_pred_dropout = var_pred_dropout
+        self.vocoder_offset = vocoder_offset
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            max_position_embeddings=max_position_embeddings,
+            **kwargs,
+        )
diff --git a/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py b/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py
new file mode 100644
index 00000000000000..8d4320cff5bd9b
--- /dev/null
+++ b/src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py
@@ -0,0 +1,405 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Converting Meta SeamlessM4Tv2 checkpoints from seamless_communication to HF."""
+
+
+import argparse
+import os
+from pathlib import Path
+
+import torch
+from accelerate.utils.modeling import find_tied_parameters
+from seamless_communication.inference import Translator
+
+from transformers import (
+    SeamlessM4TFeatureExtractor,
+    SeamlessM4TProcessor,
+    SeamlessM4TTokenizer,
+    SeamlessM4Tv2Config,
+    SeamlessM4Tv2Model,
+)
+from transformers.utils import logging
+
+
+# fmt: off
+UNIT_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kan__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tam__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__", ]
+# fmt: on
+
+# fmt: off
+VOCODER_SUPPORTED_LANGUAGES = ["__arb__", "__ben__", "__cat__", "__ces__", "__cmn__", "__cym__", "__dan__", "__deu__", "__eng__", "__est__", "__fin__", "__fra__", "__hin__", "__ind__", "__ita__", "__jpn__", "__kor__", "__mlt__", "__nld__", "__pes__", "__pol__", "__por__", "__ron__", "__rus__", "__slk__", "__spa__", "__swe__", "__swh__", "__tel__", "__tgl__", "__tha__", "__tur__", "__ukr__", "__urd__", "__uzn__", "__vie__",]
+# fmt: on
+
+# fmt: off
+LARGE_SUPPORTED_LANGUAGES = ["afr","amh","arb","ary","arz","asm","azj","bel","ben","bos","bul","cat","ceb","ces","ckb","cmn","cmn_Hant","cym","dan","deu","ell","eng","est","eus","fin","fra","fuv","gaz","gle","glg","guj","heb","hin","hrv","hun","hye","ibo","ind","isl","ita","jav","jpn","kan","kat","kaz","khk","khm","kir","kor","lao","lit","lug","luo","lvs","mai","mal","mar","mkd","mlt","mni","mya","nld","nno","nob","npi","nya","ory","pan","pbt","pes","pol","por","ron","rus","sat","slk","slv","sna","snd","som","spa","srp","swe","swh","tam","tel","tgk","tgl","tha","tur","ukr","urd","uzn","vie","yor","yue","zlm","zul",]
+# fmt: on
+
+
+def assert_param_count(model_1, model_2):
+    count_1 = sum(p[1].numel() for p in model_1.named_parameters() if "final_proj" not in p[0])
+    count_2 = sum(p[1].numel() for p in model_2.named_parameters() if "final_proj" not in p[0])
+    assert count_1 == count_2, f"{model_1.__class__}: {count_1} != {model_2.__class__}: {count_2}"
+
+
+def param_count(model):
+    return sum(p[1].numel() for p in model.named_parameters() if "final_proj" not in p[0])
+
+
+def _grab_best_device(use_gpu=True):
+    if torch.cuda.device_count() > 0 and use_gpu:
+        device = "cuda"
+    else:
+        device = "cpu"
+    return torch.device(device)
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+vocoder_convert_list = [
+    ("ups", "hifi_gan.upsampler"),
+    ("conv_pre", "hifi_gan.conv_pre"),
+    ("resblocks", "hifi_gan.resblocks"),
+    ("conv_post", "hifi_gan.conv_post"),
+    ("lang", "language_embedding"),
+    ("spkr", "speaker_embedding"),
+    ("dict.", "unit_embedding."),
+    ("dur_predictor.conv1.0", "dur_predictor.conv1"),
+    ("dur_predictor.conv2.0", "dur_predictor.conv2"),
+]
+
+# order is important
+wav2vec_convert_list = [
+    ("speech_encoder_frontend.model_dim_proj", "feature_projection.projection"),
+    ("speech_encoder_frontend.post_extract_layer_norm", "feature_projection.layer_norm"),
+    ("speech_encoder_frontend.pos_encoder.conv", "encoder.pos_conv_embed.conv"),
+    ("speech_encoder.inner.layers", "encoder.layers"),
+    ("speech_encoder.inner_layer_norm", "encoder.layer_norm"),
+    ("speech_encoder.adaptor_layers", "adapter.layers"),
+    ("inner_proj", "intermediate_dense"),
+    ("self_attn.output_proj", "self_attn.linear_out"),
+    ("output_proj", "output_dense"),
+    ("self_attn.k_proj", "self_attn.linear_k"),
+    ("self_attn.v_proj", "self_attn.linear_v"),
+    ("self_attn.q_proj", "self_attn.linear_q"),
+    ("self_attn.sdpa.u_bias", "self_attn.pos_bias_u"),
+    ("self_attn.sdpa.v_bias", "self_attn.pos_bias_v"),
+    ("self_attn.sdpa.rel_k_embed", "self_attn.distance_embedding"),
+    ("self_attn.sdpa.r_proj", "self_attn.linear_pos"),
+    ("conv.pointwise_conv1", "conv_module.pointwise_conv1"),
+    ("conv.pointwise_conv2", "conv_module.pointwise_conv2"),
+    ("conv.depthwise_conv", "conv_module.depthwise_conv"),
+    ("conv.batch_norm", "conv_module.batch_norm"),
+    ("conv.layer_norm", "conv_module.depthwise_layer_norm"),
+    ("conv_layer_norm", "conv_module.layer_norm"),
+    ("speech_encoder.proj1", "intermediate_ffn.intermediate_dense"),
+    ("speech_encoder.proj2", "intermediate_ffn.output_dense"),
+    ("speech_encoder.layer_norm", "inner_layer_norm"),
+]
+
+t2u_convert_list = [
+    ("t2u_model.final_proj", "lm_head"),
+    ("t2u_model.", "model."),
+    ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
+    ("encoder_decoder_attn", "cross_attention"),
+    ("linear_k", "k_proj"),
+    ("linear_v", "v_proj"),
+    ("linear_q", "q_proj"),
+    ("ffn.inner_proj", "ffn.fc1"),
+    ("ffn.output_proj", "ffn.fc2"),
+    ("output_proj", "out_proj"),
+    ("decoder_frontend.embed_char", "decoder.embed_char"),
+    ("decoder_frontend.pos_emb_alpha_char", "decoder.pos_emb_alpha_char"),
+    ("decoder_frontend.embed", "decoder.embed_tokens"),
+    ("decoder_frontend.pos_emb_alpha", "decoder.pos_emb_alpha"),
+    ("conv1d.conv", "conv"),
+    ("conv1d_layer_norm", "conv_layer_norm"),
+    ("decoder_frontend.variance_adaptor", "decoder"),
+    ("duration_predictor.conv1.0", "duration_predictor.conv1"),
+    ("duration_predictor.conv2.0", "duration_predictor.conv2"),
+]
+
+text_convert_list = [
+    ("text_encoder.", ""),
+    ("text_decoder.", ""),
+    ("text_encoder_frontend.embed", "embed_tokens"),
+    ("text_decoder_frontend.embed", "embed_tokens"),
+    ("encoder_decoder_attn_layer_norm", "cross_attention_layer_norm"),
+    ("encoder_decoder_attn", "cross_attention"),
+    ("linear_k", "k_proj"),
+    ("linear_v", "v_proj"),
+    ("linear_q", "q_proj"),
+    ("ffn.inner_proj", "ffn.fc1"),
+    ("ffn.output_proj", "ffn.fc2"),
+    ("output_proj", "out_proj"),
+    ("final_proj", "lm_head"),
+]
+
+CUR_PATH = os.path.dirname(os.path.abspath(__file__))
+default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache")
+CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "huggingface", "hub")
+
+
+def _load_hf_config():
+    return SeamlessM4Tv2Config()
+
+
+def _convert_model(
+    original_model,
+    hf_model,
+    convert_list,
+    device,
+    unwanted_prefix="model.",
+    filter_state_dict="speech",
+    exclude_state_dict=None,
+):
+    state_dict = original_model.state_dict()
+
+    # filter func
+    if isinstance(filter_state_dict, str):
+
+        def filter_func(x):
+            return filter_state_dict in x[0]
+
+    else:
+
+        def filter_func(item):
+            if exclude_state_dict is not None and exclude_state_dict in item[0]:
+                return False
+            for filter_el in filter_state_dict:
+                if filter_el in item[0]:
+                    return True
+
+            return False
+
+    state_dict = dict(filter(filter_func, state_dict.items()))
+
+    for k, v in list(state_dict.items()):
+        new_k = k[len(unwanted_prefix) :]
+        for old_layer_name, new_layer_name in convert_list:
+            if old_layer_name in new_k:
+                new_k = new_k.replace(old_layer_name, new_layer_name)
+
+        # must do it by hand
+        if ".layer_norm" in new_k and new_k.split(".layer_norm")[0][-1].isnumeric():
+            new_k = new_k.replace("layer_norm", "final_layer_norm")
+
+        state_dict[new_k] = state_dict.pop(k)
+
+    extra_keys = set(state_dict.keys()) - set(hf_model.state_dict().keys())
+    extra_keys = set(extra_keys)
+    missing_keys = set(hf_model.state_dict().keys()) - set(state_dict.keys())
+    missing_keys = set({k for k in missing_keys if "final_logits_bias" not in k})
+    if len(extra_keys) != 0:
+        raise ValueError(f"extra keys found: {extra_keys}")
+    if len(missing_keys) != 0:
+        raise ValueError(f"missing keys: {missing_keys}")
+    hf_model.load_state_dict(state_dict, strict=False)
+    n_params = param_count(hf_model)
+
+    logger.info(f"model loaded: {round(n_params/1e6,1)}M params")
+
+    hf_model.eval()
+    hf_model.to(device)
+    del state_dict
+
+    return hf_model
+
+
+def load_model(save_dir, model_type, repo_id):
+    """
+    Meta SeamlessM4Tv2 is made of 8 main components:
+    - speech_encoder (#1) and speech_encoder_frontend (#2)
+    - t2u_model (#3)
+    - text_encoder (#4) and text_encoder_frontend (#5)
+    - text_decoder (#6) [and text_decoder_frontend (#5) = equals to text_encoder_frontend]
+    - final_proj (#7)
+    - vocoder (#8)
+    """
+    device = _grab_best_device()
+    name = "seamlessM4T_v2_large"
+
+    original_model = Translator(name, "vocoder_v2", device, dtype=torch.float32)
+
+    ######### TOKENIZER
+
+    langs = LARGE_SUPPORTED_LANGUAGES
+    langs = [f"__{lang}__" for lang in langs]
+    vocab_file = os.path.join(os.path.expanduser("~"), "tokenizer", model_type, "tokenizer.model")
+
+    save_dir = os.path.join(save_dir, name)
+    Path(save_dir).mkdir(exist_ok=True)
+
+    tokenizer = SeamlessM4TTokenizer(vocab_file, additional_special_tokens=langs)
+
+    sanity_check_lang_id = tokenizer.convert_tokens_to_ids("__fra__")
+
+    tokenizer.save_pretrained(save_dir)
+    tokenizer = SeamlessM4TTokenizer.from_pretrained(save_dir)
+
+    if sanity_check_lang_id != tokenizer.convert_tokens_to_ids("__fra__"):
+        raise ValueError(
+            f"Error in tokenizer saving/loading - __fra__ lang id is not coherent: {sanity_check_lang_id} vs {tokenizer.convert_tokens_to_ids('__fra__')}"
+        )
+
+    ####### get language to ids dict
+    text_decoder_lang_code_to_id = {lang.replace("__", ""): tokenizer.convert_tokens_to_ids(lang) for lang in langs}
+    # offset: vocoder unit vocab size + 5 (for EOS/PAD/BOS/UNK/MSK) + len(supported_languages)
+    t2u_lang_code_to_id = {
+        code.replace("__", ""): i + 10005 + len(UNIT_SUPPORTED_LANGUAGES)
+        for i, code in enumerate(UNIT_SUPPORTED_LANGUAGES)
+    }
+    vocoder_lang_code_to_id = {code.replace("__", ""): i for i, code in enumerate(VOCODER_SUPPORTED_LANGUAGES)}
+
+    ######### FE
+
+    fe = SeamlessM4TFeatureExtractor(language_code=langs)
+
+    fe.save_pretrained(save_dir)
+    fe = SeamlessM4TFeatureExtractor.from_pretrained(save_dir)
+
+    processor = SeamlessM4TProcessor(feature_extractor=fe, tokenizer=tokenizer)
+    processor.save_pretrained(save_dir)
+    processor.push_to_hub(repo_id=repo_id, create_pr=True)
+
+    processor = SeamlessM4TProcessor.from_pretrained(save_dir)
+
+    ######## Model
+
+    # init config
+    hf_config = _load_hf_config()
+
+    ######## get id_to_text and char_to_id from original model tokenizers
+    id_to_text = {i: original_model.text_tokenizer.model.index_to_token(i) for i in range(hf_config.vocab_size)}
+    char_to_id = {
+        original_model.model.t2u_model.decoder_frontend.char_tokenizer.model.index_to_token(i): i for i in range(10904)
+    }
+
+    # init model
+    hf_model = SeamlessM4Tv2Model(hf_config)
+
+    hf_model.generation_config.__setattr__("text_decoder_lang_to_code_id", text_decoder_lang_code_to_id)
+    hf_model.generation_config.__setattr__("t2u_lang_code_to_id", t2u_lang_code_to_id)
+    hf_model.generation_config.__setattr__("vocoder_lang_code_to_id", vocoder_lang_code_to_id)
+    hf_model.generation_config.__setattr__("id_to_text", id_to_text)
+    hf_model.generation_config.__setattr__("char_to_id", char_to_id)
+
+    # -1. take care of vocoder
+    # similarly to speech T5 must apply and remove weight norm
+    hf_model.vocoder.apply_weight_norm()
+    hf_model.vocoder = _convert_model(
+        original_model,
+        hf_model.vocoder,
+        vocoder_convert_list,
+        device,
+        unwanted_prefix="vocoder.code_generator.",
+        filter_state_dict="vocoder",
+    )
+    hf_model.vocoder.remove_weight_norm()
+
+    # 1. take care of speech encoder
+    wav2vec = hf_model.speech_encoder
+    hf_model.speech_encoder = _convert_model(
+        original_model, wav2vec, wav2vec_convert_list, device, unwanted_prefix="model.", filter_state_dict="speech"
+    )
+
+    # 2. take care of t2u
+
+    hf_model.t2u_model = _convert_model(
+        original_model,
+        hf_model.t2u_model,
+        t2u_convert_list,
+        device,
+        unwanted_prefix="model.",
+        filter_state_dict="t2u_model",
+    )
+
+    # 3. take care of text encoder
+    hf_model.text_encoder = _convert_model(
+        original_model,
+        hf_model.text_encoder,
+        text_convert_list,
+        device,
+        unwanted_prefix="model.",
+        filter_state_dict=["model.text_encoder"],
+        exclude_state_dict="t2u_model",
+    )
+
+    # 4. take care of text decoder
+    hf_model.text_decoder = _convert_model(
+        original_model,
+        hf_model.text_decoder,
+        text_convert_list,
+        device,
+        unwanted_prefix="model.",
+        filter_state_dict=["model.text_decoder"],
+        exclude_state_dict="t2u_model",
+    )
+
+    # 5. take care of final proj
+    hf_model.lm_head = _convert_model(
+        original_model,
+        hf_model.lm_head,
+        [("final_proj.", "")],
+        device,
+        unwanted_prefix="model.",
+        filter_state_dict=["model.final_proj"],
+        exclude_state_dict="t2u_model",
+    )
+
+    # sanity check
+    print(find_tied_parameters(hf_model))
+
+    count_1 = param_count(hf_model)
+    count_2 = param_count(original_model)
+
+    print(f"HF MODEL:{count_1}, ORIGINAL_MODEL: {count_2}, diff:{count_1 - count_2}")
+    print(f"HF MODEL excluding embeddings:{hf_model.num_parameters(exclude_embeddings=True)}")
+
+    del original_model
+
+    hf_model.generation_config._from_model_config = False
+    hf_model.save_pretrained(save_dir)
+    hf_model.push_to_hub(repo_id=repo_id, create_pr=True)
+    hf_model = SeamlessM4Tv2Model.from_pretrained(save_dir)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+
+    parser.add_argument(
+        "--model_type",
+        default="large",
+        type=str,
+        help="Model type.",
+    )
+
+    parser.add_argument(
+        "--save_dir",
+        default="/home/ubuntu/weights_v2",
+        type=str,
+        help="Path to the output PyTorch model.",
+    )
+
+    parser.add_argument(
+        "--repo_id",
+        default="facebook/seamless-m4t-v2-large",
+        type=str,
+        help="Repo ID.",
+    )
+
+    args = parser.parse_args()
+
+    load_model(args.save_dir, args.model_type, args.repo_id)
diff --git a/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
new file mode 100644
index 00000000000000..f1a26b3e5b6924
--- /dev/null
+++ b/src/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -0,0 +1,4799 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch SeamlessM4Tv2 model."""
+
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import Tensor, nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...deepspeed import is_deepspeed_zero3_enabled
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+    Wav2Vec2BaseModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_seamless_m4t_v2 import SeamlessM4Tv2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = ""
+_CONFIG_FOR_DOC = "SeamlessM4Tv2Config"
+
+SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "facebook/seamless-m4t-v2-large",
+    # See all SeamlessM4T-v2 models at https://huggingface.co/models?filter=seamless_m4t_v2
+]
+
+
+SPEECHT5_PRETRAINED_HIFIGAN_CONFIG_ARCHIVE_MAP = {
+    "microsoft/speecht5_hifigan": "https://huggingface.co/microsoft/speecht5_hifigan/resolve/main/config.json",
+}
+
+
+@dataclass
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TGenerationOutput with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2GenerationOutput(ModelOutput):
+    """
+    Class defining the generated outputs from [`SeamlessM4Tv2Model`], [`SeamlessM4Tv2ForTextToText`],
+    [`SeamlessM4Tv2ForTextToSpeech`], [`SeamlessM4Tv2ForSpeechToSpeech`] and [`SeamlessM4Tv2ForTextToSpeech`].
+
+    Args:
+        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            The final audio waveform predicted by the model.
+        waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
+            The length in samples of each element in the `waveform` batch.
+        sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
+            The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
+            early due to the `eos_token_id`.
+        unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
+            The generated translated unit sequences. This is the output of the text-to-units model. The second
+            dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
+            early due to the `t2u_eos_token_id`.
+    """
+
+    waveform: Optional[torch.FloatTensor] = None
+    waveform_lengths: Optional[torch.IntTensor] = None
+    sequences: Optional[Tuple[torch.FloatTensor]] = None
+    unit_sequences: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
+    """
+    Class defining the outputs from [`SeamlessM4Tv2TextToUnitDecoder`].
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+            for *masked*
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    padding_mask: Optional[torch.Tensor] = None
+
+
+@dataclass
+class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
+    """
+        Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
+        [`SeamlessM4Tv2TextToUnitModel`].
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+            for *masked*
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    padding_mask: Optional[torch.Tensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None
+    loss: Optional[torch.FloatTensor] = None
+
+
+SEAMLESS_M4T_V2_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`~SeamlessM4Tv2Config`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+            Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+            [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+    """
+
+M4T_TEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+
+M4T_SPEECH_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+            Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+            [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+        """
+
+SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING = r"""
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+            is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+
+            For translation and summarization training, `decoder_input_ids` should be provided. If no
+            `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
+            for denoising pre-training following the paper.
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
+            input (see `past_key_values`). This is useful if you want more control over how to convert
+            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+
+            If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value
+            of `inputs_embeds`.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+M4T_MODEL_INPUTS_DOCSTRING = SEAMLESS_M4T_V2_MULTIMODAL_INPUTS_DOCSTRING + SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING
+
+M4T_TEXT_INPUTS_DOCSTRING = M4T_TEXT_INPUTS_DOCSTRING + SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING
+
+M4T_SPEECH_INPUTS_DOCSTRING = M4T_SPEECH_INPUTS_DOCSTRING + SEAMLESS_M4T_V2_END_INPUTS_DOCSTRING
+
+M4T_TEXT_TO_UNITS_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
+            Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
+            dictionary in the generation configuration.
+        char_count_per_id (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Number of characters per input id.
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+############ UTILS ################
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
+    """
+    Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
+    stops at the corresponding element in `seq_lens`.
+
+    Args:
+        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
+            The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
+        seq_lens (`torch.Tensor` of shape `(batch)`:
+            Each element represents the length of the sequence at the same index in `hidden_states`
+
+    Returns:
+        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
+    """
+    batch_size, mask_seq_len = hidden_states.shape[:2]
+
+    indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
+
+    bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
+
+    mask = hidden_states.new_ones((batch_size, mask_seq_len))
+
+    mask = mask.masked_fill(bool_mask, 0)
+
+    return mask
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.format_speech_generation_kwargs with SeamlessM4T->SeamlessM4Tv2
+def format_speech_generation_kwargs(kwargs):
+    """
+    Format kwargs for SeamlessM4Tv2 models that generate speech, attribute kwargs to either the text generation or the
+    speech generation models.
+
+    Args:
+        kwargs (`dict`)`:
+             Keyword arguments are of two types:
+
+                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                except for `decoder_input_ids` which will only be passed through the text components.
+                - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                This means you can, for example, specify a generation strategy for one generation but not for the
+                other.
+    """
+    # attribute kwargs to models
+    kwargs_text = {}
+    kwargs_speech = {}
+    for key, value in kwargs.items():
+        if key.startswith("text_"):
+            key = key[len("text_") :]
+            kwargs_text[key] = value
+        elif key.startswith("speech_"):
+            key = key[len("speech_") :]
+            kwargs_speech[key] = value
+        else:
+            # If the key is already in a specific config, then it's been set with a
+            # submodules specific value and we don't override
+            if key not in kwargs_text:
+                kwargs_text[key] = value
+            if key not in kwargs_speech:
+                kwargs_speech[key] = value
+    return kwargs_text, kwargs_speech
+
+
+############ SPEECH ENCODER related code ################
+
+
+class SeamlessM4Tv2ConformerFeatureProjection(nn.Module):
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeatureProjection.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+    def forward(self, hidden_states):
+        # non-projected hidden states are needed for quantization
+        norm_hidden_states = self.layer_norm(hidden_states.to(self.layer_norm.weight.dtype))
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeedForward with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerFeedForward(nn.Module):
+    def __init__(self, config, act_fn=None, dropout=None):
+        super().__init__()
+        dropout = dropout if dropout is not None else config.speech_encoder_dropout
+        act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act
+
+        self.intermediate_dropout = nn.Dropout(dropout)
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size)
+        self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn
+
+        self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class SeamlessM4Tv2ConformerConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block. Uses a causal depthwise convolution similar to that
+    described in Section 2.1 of `https://doi.org/10.48550/arxiv.1609.03499"""
+
+    def __init__(self, config):
+        super().__init__()
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.pointwise_conv1 = nn.Conv1d(
+            config.hidden_size,
+            2 * config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = nn.GLU(dim=1)
+        self.depthwise_conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=0,
+            groups=config.hidden_size,
+            bias=False,
+        )
+        self.depthwise_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.activation = ACT2FN[config.speech_encoder_hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+    def forward(self, hidden_states, attention_mask=None):
+        hidden_states = self.layer_norm(hidden_states)
+
+        # Ensure that we do not leak padded positions in depthwise convolution.
+        # Put 0 where necessary
+        if attention_mask is not None:
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, dim)
+        hidden_states = self.glu(hidden_states)
+
+        # Pad the sequence entirely on the left because of causal convolution.
+        hidden_states = torch.nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0))
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.depthwise_layer_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class SeamlessM4Tv2ConformerSelfAttention(nn.Module):
+    """Construct a SeamlessM4Tv2ConformerSelfAttention object.
+    Can be enhanced with relative position embeddings.
+    """
+
+    def __init__(self, config, use_position_embeddings=True):
+        super().__init__()
+
+        self.head_size = config.hidden_size // config.speech_encoder_attention_heads
+        self.num_heads = config.speech_encoder_attention_heads
+        self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None
+
+        self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.dropout = nn.Dropout(p=config.speech_encoder_dropout)
+
+        if self.position_embeddings_type == "relative_key":
+            self.left_max_position_embeddings = config.left_max_position_embeddings
+            self.right_max_position_embeddings = config.right_max_position_embeddings
+            num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1
+            self.distance_embedding = nn.Embedding(num_positions, self.head_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        if self.position_embeddings_type == "relative_key":
+            query_length, key_length = query.shape[2], key.shape[2]
+
+            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_r - position_ids_l
+            distance = torch.clamp(distance, -self.left_max_position_embeddings, self.right_max_position_embeddings)
+
+            positional_embedding = self.distance_embedding(distance + self.left_max_position_embeddings)
+            positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility
+
+            relative_position_attn_weights = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+            attn_weights = attn_weights + (relative_position_attn_weights / math.sqrt(self.head_size))
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+
+        # => (batch, head, time1, time2)
+        attn_weights = torch.softmax(attn_weights, dim=-1)
+        attn_weights = self.dropout(attn_weights)
+
+        # => (batch, head, time1, d_k)
+        attn_output = torch.matmul(attn_weights, value)
+
+        # => (batch, time1, hidden_size)
+        attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        attn_output = self.linear_out(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+class SeamlessM4Tv2ConformerEncoderLayer(nn.Module):
+    """Conformer block based on https://arxiv.org/abs/2005.08100."""
+
+    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4Tv2, attention_dropout->speech_encoder_dropout, torch.nn->nn
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.speech_encoder_dropout
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn1 = SeamlessM4Tv2ConformerFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_dropout = nn.Dropout(dropout)
+        self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = SeamlessM4Tv2ConformerConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn2 = SeamlessM4Tv2ConformerFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        conv_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        hidden_states = hidden_states
+
+        # 1. Feed-Forward 1 layer
+        residual = hidden_states
+        hidden_states = self.ffn1_layer_norm(hidden_states)
+        hidden_states = self.ffn1(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        residual = hidden_states
+
+        # 2. Self-Attention layer
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # 3. Convolutional Layer
+        residual = hidden_states
+        hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
+        hidden_states = residual + hidden_states
+
+        # 4. Feed-Forward 2 Layer
+        residual = hidden_states
+        hidden_states = self.ffn2_layer_norm(hidden_states)
+        hidden_states = self.ffn2(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states, attn_weights
+
+
+class SeamlessM4Tv2ConformerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+        self.layers = nn.ModuleList(
+            [SeamlessM4Tv2ConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
+        )
+
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.gradient_checkpointing = False
+
+    def _apply_chunk_attention(self, attention_mask, hidden_states):
+        """
+        Creates a chunk attention mask. It creates a mask to prevent attention across chunks, ensuring that each
+        position attends only to positions within its own chunk. If a left chunk overlap is specified
+        (`speech_encoder_chunk_size` in the configuration), the attention mask is adjusted accordingly to allow each
+        position to also attends the `speech_encoder_chunk_size - 1` previous chunks.
+        """
+        sequence_len = hidden_states.shape[1]
+
+        chunk_indices = torch.arange(sequence_len, device=hidden_states.device)
+        chunk_indices = torch.div(chunk_indices, self.config.speech_encoder_chunk_size).long()
+
+        start_indices = torch.full_like(chunk_indices, 0)
+        if self.config.speech_encoder_left_chunk_num >= 0:
+            start_indices = (chunk_indices - self.config.speech_encoder_left_chunk_num).clamp_(min=0)
+            start_indices = start_indices * self.config.speech_encoder_chunk_size
+            start_indices = start_indices
+        start_indices = start_indices.unsqueeze(1).expand(-1, sequence_len)
+
+        end_indices = ((chunk_indices + 1) * self.config.speech_encoder_chunk_size).clamp_(max=sequence_len)
+
+        end_indices = end_indices.unsqueeze(1).expand(-1, sequence_len)
+
+        indices = torch.arange(sequence_len, device=hidden_states.device).unsqueeze(0).expand(sequence_len, -1)
+
+        chunk_mask = (indices < start_indices) | (indices >= end_indices)
+        chunk_mask = chunk_mask.unsqueeze(0).unsqueeze(0)
+
+        attention_mask = chunk_mask if attention_mask is None else (attention_mask.bool() | chunk_mask)
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)
+        return attention_mask
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        conv_attention_mask = attention_mask
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+            # extend attention_mask
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        if self.config.speech_encoder_chunk_size is not None:
+            attention_mask = self._apply_chunk_attention(attention_mask, hidden_states)
+
+        if attention_mask is not None:
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+
+        hidden_states = self.dropout(hidden_states)
+
+        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = (
+                True if self.training and (dropout_probability < self.config.speech_encoder_layerdrop) else False
+            )
+            if not skip_the_layer or deepspeed_zero3_is_enabled:
+                # under deepspeed zero3 all gpus must run in sync
+                if self.gradient_checkpointing and self.training:
+                    layer_outputs = self._gradient_checkpointing_func(
+                        layer.__call__,
+                        hidden_states,
+                        attention_mask,
+                    )
+                else:
+                    layer_outputs = layer(
+                        hidden_states,
+                        attention_mask=attention_mask,
+                        output_attentions=output_attentions,
+                        conv_attention_mask=conv_attention_mask,
+                    )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapterLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerAdapterLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.adaptor_dropout
+
+        self.kernel_size = config.adaptor_kernel_size
+        self.stride = config.adaptor_stride
+
+        # 1. residual convolution
+        self.residual_layer_norm = nn.LayerNorm(embed_dim)
+        self.residual_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.activation = nn.GLU(dim=1)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config, use_position_embeddings=False)
+        self.self_attn_dropout = nn.Dropout(dropout)
+
+        # Feed-forward
+        self.ffn_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=dropout)
+
+    def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+        pad = self.kernel_size // 2
+        seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+        seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
+
+        return seq_lens.floor()
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        residual = self.residual_layer_norm(hidden_states)
+
+        # Apply pooling to the residual to match the sequence length of the
+        # multi-head attention output.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        residual = residual.transpose(1, 2)
+        residual = self.residual_conv(residual)
+        residual = self.activation(residual)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        residual = residual.transpose(1, 2)
+
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # Apply pooling before feeding to the multihead-attention layer.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.self_attn_conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                hidden_states.device
+            )
+            attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
+            attention_mask = _prepare_4d_attention_mask(
+                attention_mask,
+                hidden_states.dtype,
+            )
+
+        # The rest of the computation is identical to a vanilla Transformer
+        # encoder layer.
+        hidden_states, attn_weigths = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+        hidden_states = self.ffn(hidden_states) + residual
+
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapter with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerAdapter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.layers = nn.ModuleList(
+            SeamlessM4Tv2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)
+        )
+
+    def forward(self, hidden_states, attention_mask):
+        # down project hidden_states if necessary
+
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask)
+
+        return hidden_states
+
+
+############ TEXT / UNITS related code ################
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
+class SeamlessM4Tv2SinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # in forward put the weights on the correct dtype and device of the param
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.register_buffer("weights", emb_weights, persistent=False)
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings.
+
+        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+        "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.float) * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(
+        self, input_ids: torch.Tensor = None, inputs_embeds: torch.Tensor = None, past_key_values_length: int = 0
+    ):
+        if input_ids is not None:
+            bsz, seq_len = input_ids.size()
+            # Create the position ids from the input token ids. Any padded tokens remain padded.
+            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+                input_ids.device
+            )
+        else:
+            bsz, seq_len = inputs_embeds.size()[:-1]
+            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+
+
+class SeamlessM4Tv2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4Tv2
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[SeamlessM4Tv2Config] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, projection: torch.Tensor) -> torch.Tensor:
+        new_projection_shape = projection.size()[:-1] + (self.num_heads, self.head_dim)
+        # move heads to 2nd position (B, T, H * D) -> (B, T, H, D) -> (B, H, T, D)
+        new_projection = projection.view(new_projection_shape).permute(0, 2, 1, 3)
+        return new_projection
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        is_cross_attention = encoder_hidden_states is not None
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        # use encoder_hidden_states if cross attention
+        current_states = encoder_hidden_states if encoder_hidden_states is not None else hidden_states
+        # checking that the `sequence_length` of the `past_key_value` is the same as the he provided
+        # `encoder_hidden_states` to support prefix tuning
+        if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        else:
+            key_states = self._shape(self.k_proj(current_states))
+            value_states = self._shape(self.v_proj(current_states))
+            if past_key_value is not None and not is_cross_attention:
+                # reuse k, v, self_attention
+                key_states = torch.cat([past_key_value[0], key_states], dim=2)
+                value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        query_states = self._shape(self.q_proj(hidden_states) * self.scaling)
+        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+
+        # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).type_as(attention_scores)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        #  attn_output = torch.bmm(attn_probs, value_states) ?
+        context_states = torch.matmul(attn_weights, value_states)
+        # attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) ?
+        context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
+        attn_output = self.out_proj(context_states)
+
+        if output_attentions:
+            return attn_output, attn_weights, past_key_value
+        else:
+            return attn_output, None, past_key_value
+
+
+# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4Tv2,DenseActDense->FeedForwardNetwork, d_model->hidden_size
+class SeamlessM4Tv2FeedForwardNetwork(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config, ffn_dim: int):
+        super().__init__()
+        self.fc1 = nn.Linear(config.hidden_size, ffn_dim)
+        self.fc2 = nn.Linear(ffn_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.activation_dropout)
+        self.act = ACT2FN[config.activation_function]
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        if (
+            isinstance(self.fc2.weight, torch.Tensor)
+            and hidden_states.dtype != self.fc2.weight.dtype
+            and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
+        ):
+            hidden_states = hidden_states.to(self.fc2.weight.dtype)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoderLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2EncoderLayer(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config, encoder_ffn_dim=None, encoder_attention_heads=None):
+        super().__init__()
+        encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
+        encoder_attention_heads = (
+            config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads
+        )
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = SeamlessM4Tv2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=encoder_ffn_dim)
+
+        self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.attn_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.ffn_dropout(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoderLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2DecoderLayer(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
+        super().__init__()
+        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+        decoder_attention_heads = (
+            config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+        )
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = SeamlessM4Tv2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.attn_dropout = nn.Dropout(config.dropout)
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.cross_attention = SeamlessM4Tv2Attention(
+            self.embed_dim, decoder_attention_heads, config.attention_dropout, is_decoder=True
+        )
+        self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=decoder_ffn_dim)
+
+        self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`):
+                encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
+                very large negative values.
+            past_key_value (`Tuple(torch.FloatTensor)`):
+                cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.attn_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.cross_attention_layer_norm(hidden_states)
+
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.cross_attention(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_value=cross_attn_past_key_value,
+                attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = self.attn_dropout(hidden_states)
+            hidden_states = residual + hidden_states
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value += cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.ffn_dropout(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states, present_key_value)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+class SeamlessM4Tv2TextToUnitDecoderLayer(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
+        super().__init__()
+        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+        decoder_attention_heads = (
+            config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+        )
+        self.dropout = config.dropout
+        self.embed_dim = config.hidden_size
+
+        self.self_attn = SeamlessM4Tv2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.conv1 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same")
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same")
+
+        self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.conv_dropout = nn.Dropout(self.dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+            padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
+                or 0 for *masked*
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Conv
+        residual = hidden_states
+
+        # Apply padding mask to avoid leaking padded positions in the convolution layer
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+        hidden_states = self.conv1(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.conv2(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states = self.conv_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+        hidden_states = self.conv_layer_norm(hidden_states)
+
+        outputs = (hidden_states, present_key_value)
+
+        if output_attentions:
+            outputs += self_attn_weights
+
+        return outputs
+
+
+############ SUB-MODELS related code ################
+
+
+class SeamlessM4Tv2PreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = SeamlessM4Tv2Config
+    base_model_prefix = "seamless_m4t_v2"
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "SeamlessM4Tv2EncoderLayer",
+        "SeamlessM4Tv2DecoderLayer",
+        "SeamlessM4Tv2ConformerEncoderLayer",
+        "SeamlessM4Tv2TextToUnitDecoderLayer",
+    ]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, SeamlessM4Tv2ConformerSelfAttention):
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, SeamlessM4Tv2ConformerFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TPreTrainedModel._compute_sub_sample_lengths_from_attention_mask
+    def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+        kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride
+        pad = kernel_size // 2
+        seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+        seq_lens = ((seq_lens + 2 * pad - kernel_size) / stride) + 1
+
+        return seq_lens.floor()
+
+    def _indices_to_subwords(self, input_ids):
+        """
+        Returns the corresponding text string for each input id.
+        """
+        if not hasattr(self.generation_config, "id_to_text"):
+            raise ValueError(
+                """This model generation config doesn't have a `id_to_text` key which maps
+                token ids to subwords. Make sure to load the right generation config."""
+            )
+        batch_size, sequence_len = input_ids.shape
+
+        subwords_batch = []
+        for batch_id in range(batch_size):
+            subwords = []
+            for i in range(sequence_len):
+                subword = self.generation_config.id_to_text.get(str(input_ids[batch_id, i].item()))
+                subwords.append(str(subword))
+            subwords_batch.append(subwords)
+        return subwords_batch
+
+    def _count_character_length_in_subword(
+        self,
+        input_ids,
+        subwords_batch,
+        merge_space_with_prev_subword=False,
+        pad_token_id=0,
+        unk_token_id=1,
+        space="▁",
+    ):
+        """
+        Counts the number of characters per text string associated with the input token id.
+
+        Args:
+            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+            subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
+                Corresponding text string for each input id.
+            merge_space_with_prev_subword (`bool`, *optional*, defaults to `False`):
+                Indicates if the space character is merged with the previous subword. If `False`, it will be merged
+                with the next subword.
+            pad_token_id (`int`, *optional*, defaults to 0):
+                The id of the _padding_ text token. If it is encountered when calculating the length of a subword
+                sample, the lengths of subsequent subwords will be set to 0.
+            unk_token_id (`int`, *optional*, defaults to 1):
+                The id of the _unknown_ text token. Associated to a subword of length 1.
+            space (`str`, *optional*, defaults to `"▁"`):
+                The space character.
+        """
+        batch_size, _ = input_ids.shape
+
+        char_count_per_id = input_ids.new_zeros(input_ids.size())
+
+        subword_lens = input_ids.ne(pad_token_id).sum(1)
+
+        for batch_id in range(batch_size):
+            # We slice out the tensor till the padding index.
+            subword_indices = input_ids[batch_id, : subword_lens[batch_id]]
+            subwords = subwords_batch[batch_id][: subword_lens[batch_id]]
+
+            is_next_start_with_space = [
+                len(subwords[i + 1]) > 1 and subwords[i + 1][0] == space if i < len(subwords) - 1 else False
+                for i in range(len(subwords))
+            ]
+            is_punc = [
+                len(subwords[i]) == 1
+                and not subwords[i].isalpha()
+                and not subwords[i].isnumeric()
+                and subwords[i] != space
+                for i in range(len(subwords))
+            ]
+            for i, (subword_idx, subword) in enumerate(zip(subword_indices, subwords)):
+                if subword_idx == pad_token_id:
+                    break
+
+                if subword_idx == unk_token_id:
+                    # We set char_len to 1 for an unk token.
+                    char_len = 1
+
+                    if merge_space_with_prev_subword and is_next_start_with_space[i]:
+                        char_len += 1
+                else:
+                    # By default, spaces are merged with the next subword.
+                    # char_len includes the space.
+                    char_len = len(subword)
+
+                    if merge_space_with_prev_subword:
+                        # Add the space for the next subword.
+                        if is_next_start_with_space[i]:
+                            char_len += 1
+                        # Subtract the space for the current subword.
+                        if i > 0 and is_next_start_with_space[i - 1]:
+                            char_len -= 1
+                    else:
+                        # Merge space with punctuation mark by default.
+                        if is_punc[i] and is_next_start_with_space[i]:
+                            char_len += 1
+                        # Subtract the space for the subword succeeding the punctuation mark.
+                        elif i > 0 and is_punc[i - 1] and is_next_start_with_space[i - 1]:
+                            char_len -= 1
+
+                char_count_per_id[batch_id, i] = char_len
+
+        return char_count_per_id
+
+    def _get_char_input_ids(self, input_ids, subwords_batch, char_count_per_id, pad_token_id=0, unk_token_id=1):
+        """
+        Returns the corresponding character input id for each character of `subwords_batch`.
+
+        Args:
+            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+            subwords_batch (`List[List[str]]` of shape `(batch_size, sequence_length)`):
+                Corresponding text string for each input id.
+            char_count_per_id (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Number of characters per input id.
+            pad_token_id (`int`, *optional*, defaults to 0):
+                The id of the _padding_ text token. If it is encountered when calculating the length of a subword
+                sample, the lengths of subsequent subwords will be set to 0.
+            unk_token_id (`int`, *optional*, defaults to 1):
+                The id of the _unknown_ text token. Associated to a subword of length 1.
+        Returns:
+            `torch.Tensor`: Tensor of shape `(batch_size, char_sequence_length)` containing the id of each character.
+        """
+        if not hasattr(self.generation_config, "char_to_id"):
+            raise ValueError(
+                """This model generation config doesn't have a `char_to_id` key which maps
+                characters to character ids. Make sure to load the right generation config."""
+            )
+
+        batch_size = input_ids.shape[0]
+        max_len = int(char_count_per_id.sum(1).max().item())
+
+        char_seqs = input_ids.new_zeros((batch_size, max_len)).fill_(pad_token_id)
+
+        subword_lens = input_ids.ne(pad_token_id).sum(1)
+
+        for batch_id in range(batch_size):
+            total = 0
+            subword_indices = input_ids[batch_id, : subword_lens[batch_id]]
+            subwords = subwords_batch[batch_id][: subword_lens[batch_id]]
+            for subword_idx, subword in zip(subword_indices, subwords):
+                if subword_idx == unk_token_id:
+                    char_ids = [unk_token_id]
+                else:
+                    # Get char token indices corresponding to the subwords.
+                    char_ids = [self.generation_config.char_to_id.get(ch, unk_token_id) for ch in list(subword)]
+                char_seq_len = len(char_ids)
+                char_seqs[batch_id, total : total + char_seq_len] = torch.tensor(char_ids).to(char_seqs)
+                total += char_seq_len
+        return char_seqs
+
+    def _hard_upsample(self, hidden_states, durations):
+        """
+        Repeats the time dimension of each sample in the batch based on the corresponding duration.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, *)`, *optional*):
+                The sequence to repeat, where `*` is any number of sequence-specific dimensions including none.
+            durations (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indicates how many times to repeat time segments.
+        """
+        if hidden_states.size(0) == 1:
+            hidden_states = torch.repeat_interleave(hidden_states, durations.view(-1), dim=1)
+        else:
+            # if batched sample, need to interleave per sample, and pad -> loss of parallelism
+            if hidden_states.shape[0] > 1 and self.training:
+                logger.warning_once(
+                    """`self.training=True` and you use batching. You lose parallelism during the hifigan
+                               forward pass because the samples are interleaved."""
+                )
+            hidden_states = [
+                torch.repeat_interleave(hidden_state, duration, dim=0)
+                for (hidden_state, duration) in zip(hidden_states, durations)
+            ]
+
+            hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True)
+
+        return hidden_states
+
+
+@add_start_docstrings(
+    """Transformer speech encoder consisting of *config.speech_encoder_layers* conformer self attention layers.
+    Each layer is a [`SeamlessM4Tv2ConformerEncoderLayer`].""",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TSpeechEncoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2SpeechEncoder(SeamlessM4Tv2PreTrainedModel):
+    main_input_name = "input_features"
+
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.feature_projection = SeamlessM4Tv2ConformerFeatureProjection(config)
+        self.encoder = SeamlessM4Tv2ConformerEncoder(config)
+        self.intermediate_ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=0.0)
+        self.adapter = SeamlessM4Tv2ConformerAdapter(config) if config.add_adapter else None
+        self.inner_layer_norm = nn.LayerNorm(config.hidden_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_features is None:
+            raise ValueError(
+                """Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4Tv2SpeechEncoder.forward`.
+                Make sure one of them is not `None`."""
+            )
+
+        hidden_states = self.feature_projection(input_features)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        expanded_hidden_states = self.intermediate_ffn(hidden_states)
+        hidden_states = hidden_states + 0.5 * expanded_hidden_states
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
+
+        hidden_states = self.inner_layer_norm(hidden_states)
+
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return Wav2Vec2BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# inspired from MBart and NllbMoe
+@add_start_docstrings(
+    "Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4Tv2EncoderLayer`].",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+    """
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+        is_t2u_encoder (`bool`, *optional*, defaults to `False`):
+            indicates if it belongs to the text-to-units model, in which case it won't have input embeddings
+    """,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2Encoder(SeamlessM4Tv2PreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens: Optional[nn.Embedding] = None,
+        is_t2u_encoder: bool = False,
+    ):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        embed_dim = config.hidden_size
+
+        self.is_t2u_encoder = is_t2u_encoder
+        self.max_source_positions = config.max_position_embeddings
+
+        if not self.is_t2u_encoder:
+            self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
+
+            if embed_tokens is not None:
+                self.embed_tokens.weight = embed_tokens.weight
+
+            self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+                self.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+            )
+
+        layers = []
+        for _ in range(config.encoder_layers):
+            layers.append(
+                SeamlessM4Tv2EncoderLayer(
+                    config,
+                    encoder_attention_heads=config.encoder_attention_heads,
+                    encoder_ffn_dim=config.encoder_ffn_dim,
+                )
+            )
+
+        self.layers = nn.ModuleList(layers)
+
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Tuple, BaseModelOutput]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and self.is_t2u_encoder:
+            raise ValueError(
+                "You cannot pass input_ids to the encoder of the text_to_units model. Pass inputs_embeds instead."
+            )
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.shape
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        if not self.is_t2u_encoder:
+            embed_pos = self.embed_positions(input)
+
+            hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
+        else:
+            hidden_states = inputs_embeds
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                if self.gradient_checkpointing and self.training:
+                    layer_outputs = self._gradient_checkpointing_func(
+                        encoder_layer.forward,
+                        hidden_states,
+                        attention_mask,
+                        output_attentions,
+                    )
+                else:
+                    layer_outputs = encoder_layer(
+                        hidden_states,
+                        attention_mask,
+                        output_attentions=output_attentions,
+                    )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+@add_start_docstrings(
+    "Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`].",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+    """
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+    """,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens: Optional[nn.Embedding] = None,
+    ):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            # if embed_tokens defined, use its shape instead
+            self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx)
+            self.embed_tokens.weight = embed_tokens.weight
+        else:
+            self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
+
+        self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        layers = []
+        for _ in range(config.decoder_layers):
+            layers.append(
+                SeamlessM4Tv2DecoderLayer(
+                    config,
+                    decoder_attention_heads=config.decoder_attention_heads,
+                    decoder_ffn_dim=config.decoder_ffn_dim,
+                )
+            )
+        self.layers = nn.ModuleList(layers)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`. inputs_embeds (`torch.FloatTensor` of
+                shape `(batch_size, sequence_length, hidden_size)`, *optional*): Optionally, instead of passing
+                `input_ids` you can choose to directly pass an embedded representation. This is useful if you want more
+                control over how to convert `input_ids` indices into associated vectors than the model's internal
+                embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        attention_mask = _prepare_4d_causal_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # embed positions
+        positions = self.embed_positions(input, past_key_values_length=past_key_values_length)
+
+        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    encoder_hidden_states,
+                    encoder_attention_mask,
+                    None,
+                    output_attentions,
+                    use_cache,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    past_key_value=past_key_value,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[2],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[3],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`].",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+    """
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+    """,
+)
+class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens: Optional[nn.Embedding] = None,
+    ):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            # if embed_tokens defined, use its shape instead
+            self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx)
+            self.embed_tokens.weight = embed_tokens.weight
+        else:
+            self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
+
+        self.embed_char = nn.Embedding(config.char_vocab_size, config.hidden_size)
+        self.embed_char_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.pos_emb_alpha_char = nn.Parameter(torch.ones(1))
+        self.pos_emb_alpha = nn.Parameter(torch.ones(1))
+        self.duration_predictor = SeamlessM4Tv2VariancePredictor(
+            config.variance_predictor_embed_dim,
+            config.variance_predictor_hidden_dim,
+            config.variance_predictor_kernel_size,
+            config.variance_pred_dropout,
+        )
+
+        self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        layers = []
+        for _ in range(config.decoder_layers):
+            layers.append(
+                SeamlessM4Tv2TextToUnitDecoderLayer(
+                    config,
+                    decoder_attention_heads=config.decoder_attention_heads,
+                    decoder_ffn_dim=config.decoder_ffn_dim,
+                )
+            )
+        self.layers = nn.ModuleList(layers)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.embed_tokens = value
+
+    def forward(
+        self,
+        char_input_ids: torch.LongTensor = None,
+        char_count_per_id: torch.LongTensor = None,
+        encoder_hidden_states: torch.FloatTensor = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
+        r"""
+        Args:
+            char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
+                Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
+                dictionary in the generation configuration.
+            char_count_per_id (`torch.Tensor` of shape `(batch_size, encoder_sequence_length)`):
+                Number of characters per text input id.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # create padding mask for character lengths
+        char_padding_mask = _compute_new_attention_mask(char_input_ids, char_count_per_id.sum(1))
+
+        # upsample hidden states according to characters sequence lengths
+        char_hidden_states = self._hard_upsample(encoder_hidden_states, char_count_per_id)
+        # embed char positions
+        char_positions = self.pos_emb_alpha_char * self.embed_char_positions(inputs_embeds=char_hidden_states)
+        # update char hidden states with positions and char embeddings
+        char_hidden_states = self.embed_char(char_input_ids) * self.embed_scale + char_positions + char_hidden_states
+
+        # predict duration
+        log_dur_pred = self.duration_predictor(char_hidden_states, padding_mask=char_padding_mask)
+        dur_out = torch.clamp(torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1)
+        dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0.0)
+
+        # upsample char hidden states according to predicted duration
+        char_hidden_states = self._hard_upsample(char_hidden_states, dur_out)
+
+        positions = self.pos_emb_alpha * self.embed_positions(inputs_embeds=char_hidden_states)
+        hidden_states = char_hidden_states + positions
+
+        padding_mask = _compute_new_attention_mask(hidden_states, dur_out.sum(1))
+        attention_mask = _prepare_4d_attention_mask(padding_mask, hidden_states.dtype)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    decoder_layer.__call__,
+                    hidden_states,
+                    attention_mask,
+                    padding_mask,
+                    output_attentions,
+                )
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    padding_mask=padding_mask,
+                    output_attentions=output_attentions,
+                )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attns, padding_mask] if v is not None)
+        return SeamlessM4Tv2TextToUnitDecoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            padding_mask=padding_mask,
+        )
+
+
+@add_start_docstrings(
+    "Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4Tv2Encoder`] without embeddings and the decoder is a [`SeamlessM4Tv2TextToUnitDecoder`].",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+    """
+        embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
+    """,
+)
+class SeamlessM4Tv2TextToUnitModel(SeamlessM4Tv2PreTrainedModel):
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitModel.__init__ with SeamlessM4T->SeamlessM4Tv2, Decoder->TextToUnitDecoder
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens_decoder: Optional[nn.Embedding] = None,
+    ):
+        super().__init__(config)
+
+        self.encoder = SeamlessM4Tv2Encoder(config, is_t2u_encoder=True)
+        self.decoder = SeamlessM4Tv2TextToUnitDecoder(config, embed_tokens_decoder)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        char_input_ids: torch.LongTensor = None,
+        char_count_per_id: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.Tensor], Seq2SeqModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consists of (dec_features, dec_hidden, dec_attn, padding_mask)
+        decoder_outputs = self.decoder(
+            char_input_ids=char_input_ids,
+            char_count_per_id=char_count_per_id,
+            encoder_hidden_states=encoder_outputs[0],
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return SeamlessM4Tv2TextToUnitOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            padding_mask=decoder_outputs.padding_mask,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4Tv2TextToUnitModel`].",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+    """
+        embed_tokens_decoder (`nn.Embedding`, *optional*): input embedding of the decoder.
+    """,
+)
+class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedModel):
+    _keys_to_ignore_on_load_missing = [
+        "vocoder",
+        "speech_encoder",
+        "text_encoder",
+        "text_decoder",
+    ]
+    _tied_weights_keys = ["decoder.embed_tokens.weight", "lm_head.weight"]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens_decoder: Optional[nn.Embedding] = None,
+    ):
+        # update config - used principaly for bos_token_id etc.
+        config = copy.deepcopy(config)
+        for param, val in config.to_dict().items():
+            if param.startswith("t2u_"):
+                config.__setattr__(param[4:], val)
+        super().__init__(config)
+
+        self.model = SeamlessM4Tv2TextToUnitModel(config, embed_tokens_decoder)
+
+        self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_encoder
+    def get_encoder(self):
+        return self.model.encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_decoder
+    def get_decoder(self):
+        return self.model.decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+    @add_start_docstrings_to_model_forward(M4T_TEXT_TO_UNITS_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        char_input_ids: torch.LongTensor = None,
+        char_count_per_id: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids,
+            char_input_ids=char_input_ids,
+            char_count_per_id=char_count_per_id,
+            attention_mask=attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        lm_logits = self.lm_head(outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return SeamlessM4Tv2TextToUnitOutput(
+            last_hidden_state=lm_logits,
+            padding_mask=outputs.padding_mask,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            loss=masked_lm_loss,
+        )
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration._tie_weights
+    def _tie_weights(self) -> None:
+        if getattr(self.config, "tie_word_embeddings", True):
+            output_embeddings = self.get_output_embeddings()
+            if output_embeddings is not None:
+                self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
+
+
+############ VOCODER related code ################
+
+
+HIFIGAN_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`SeamlessM4Tv2Config`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+        super().__init__()
+        self.leaky_relu_slope = leaky_relu_slope
+
+        self.convs1 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation[i],
+                    padding=self.get_padding(kernel_size, dilation[i]),
+                )
+                for i in range(len(dilation))
+            ]
+        )
+        self.convs2 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=1,
+                    padding=self.get_padding(kernel_size, 1),
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+
+    def get_padding(self, kernel_size, dilation=1):
+        return (kernel_size * dilation - dilation) // 2
+
+    def apply_weight_norm(self):
+        for layer in self.convs1:
+            nn.utils.weight_norm(layer)
+        for layer in self.convs2:
+            nn.utils.weight_norm(layer)
+
+    def remove_weight_norm(self):
+        for layer in self.convs1:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.convs2:
+            nn.utils.remove_weight_norm(layer)
+
+    def forward(self, hidden_states):
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            residual = hidden_states
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv1(hidden_states)
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv2(hidden_states)
+            hidden_states = hidden_states + residual
+        return hidden_states
+
+
+class SeamlessM4Tv2VariancePredictor(nn.Module):
+    def __init__(self, embed_dim, hidden_dim, kernel_size, var_pred_dropout):
+        super().__init__()
+
+        self.conv1 = nn.Conv1d(
+            embed_dim,
+            hidden_dim,
+            kernel_size=kernel_size,
+            padding="same",
+        )
+        self.activation_fuction = nn.ReLU()
+        self.ln1 = nn.LayerNorm(hidden_dim)
+        self.dropout_module = nn.Dropout(p=var_pred_dropout)
+        self.conv2 = nn.Conv1d(
+            hidden_dim,
+            hidden_dim,
+            kernel_size=kernel_size,
+            padding="same",
+        )
+        self.ln2 = nn.LayerNorm(hidden_dim)
+        self.proj = nn.Linear(hidden_dim, 1)
+
+    def forward(self, hidden_states: Tensor, padding_mask: Tensor = None) -> Tensor:
+        # Input: B x T x C; Output: B x T
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+        hidden_states = self.conv1(hidden_states.transpose(1, 2))
+        hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
+        hidden_states = self.dropout_module(self.ln1(hidden_states))
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+        hidden_states = self.conv2(hidden_states.transpose(1, 2))
+        hidden_states = self.activation_fuction(hidden_states).transpose(1, 2)
+        hidden_states = self.dropout_module(self.ln2(hidden_states))
+        return self.proj(hidden_states).squeeze(dim=2)
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4THifiGan with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2HifiGan(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__()
+        model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim
+        self.leaky_relu_slope = config.leaky_relu_slope
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.conv_pre = nn.Conv1d(
+            model_in_dim,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+        )
+
+        self.upsampler = nn.ModuleList()
+        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            self.upsampler.append(
+                nn.ConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    kernel_size=kernel_size,
+                    stride=upsample_rate,
+                    padding=(kernel_size - upsample_rate) // 2,
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.upsampler)):
+            channels = config.upsample_initial_channel // (2 ** (i + 1))
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
+
+        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
+
+    def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        r"""
+        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
+        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
+        waveform.
+
+        Args:
+            spectrogram (`torch.FloatTensor`):
+                Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
+                model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim`
+                is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`.
+
+        Returns:
+            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
+            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
+        """
+
+        hidden_states = self.conv_pre(input_embeds)
+        for i in range(self.num_upsamples):
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = self.upsampler[i](hidden_states)
+
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels
+
+        hidden_states = nn.functional.leaky_relu(hidden_states)
+        hidden_states = self.conv_post(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+
+        # remove seq-len dim since this collapses to 1
+        waveform = hidden_states.squeeze(1)
+
+        return waveform
+
+
+@add_start_docstrings(
+    """Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis).""",
+    HIFIGAN_START_DOCSTRING,
+)
+class SeamlessM4Tv2CodeHifiGan(PreTrainedModel):
+    config_class = SeamlessM4Tv2Config
+    main_input_name = "input_embeds"
+    _no_split_modules = []
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.pad_token_id = config.t2u_pad_token_id
+        embed_dim = config.unit_embed_dim
+        kernel_size = config.variance_predictor_kernel_size
+        var_pred_dropout = config.var_pred_dropout
+        self.dur_predictor = SeamlessM4Tv2VariancePredictor(embed_dim, embed_dim, kernel_size, var_pred_dropout)
+
+        self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim)
+        self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim)
+        self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim)
+
+        self.hifi_gan = SeamlessM4Tv2HifiGan(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_dur_output_lengths
+    def _get_dur_output_lengths(self, input_ids, dur_out):
+        """
+        Computes the output length after the duration layer.
+        """
+        unit_lengths = (input_ids != self.pad_token_id).sum(1)
+
+        # take care of edge cases where no padding or too many padding
+        unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1)
+
+        cumulative_dur_out = torch.cumsum(dur_out, dim=1)
+        unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze()
+
+        return unit_lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_output_hifigan_lengths
+    def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the hifigan convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (
+                torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1
+            )
+
+        def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+            return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1
+
+        # conv_pre
+        input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+        # upsampler
+        for i, (upsample_rate, kernel_size) in enumerate(
+            zip(self.config.upsample_rates, self.config.upsample_kernel_sizes)
+        ):
+            input_lengths = _transpose_conv_out_length(
+                input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2
+            )
+
+        # resblock
+        for i in range(len(self.config.upsample_rates)):
+            for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes):
+                for dil in dilation:
+                    input_lengths = _conv_out_length(
+                        input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil
+                    )
+
+                for dil in dilation:
+                    input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1)
+
+        # conv_post
+        input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+        return input_lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.forward with SeamlessM4T->SeamlessM4Tv2, spkr_id->speaker_id
+    def forward(
+        self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor
+    ) -> Tuple[torch.Tensor]:
+        """
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4Tv2TextToUnitForConditionalGeneration`]. [What are input
+                IDs?](../glossary#input-ids)
+            speaker_id (`int`, *optional*):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            tgt_lang (`str`, *optional*):
+                The language id to use as target language for translation.
+        """
+        hidden_states = self.unit_embedding(input_ids).transpose(1, 2)
+        spkr = self.speaker_embedding(speaker_id).transpose(1, 2)
+        lang = self.language_embedding(lang_id).transpose(1, 2)
+
+        log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2))
+        dur_out = torch.clamp(torch.round((torch.exp(log_dur_pred) - 1)).long(), min=1)
+        # B x C x T
+        if hidden_states.size(0) == 1:
+            hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
+        else:
+            # if batched sample, need to interleave per sample, and pad -> loss of parallelism
+            if hidden_states.shape[0] > 1 and self.training:
+                logger.warning(
+                    """`self.training=True` and you use batching. You lose parallelism during the hifigan
+                               forward pass because the samples are interleaved."""
+                )
+            hidden_states = [
+                torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1)
+                for (hidden_state, duration) in zip(hidden_states, dur_out)
+            ]
+
+            hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2)
+
+        spkr = spkr.repeat(1, 1, hidden_states.shape[-1])
+        lang = lang.repeat(1, 1, hidden_states.shape[-1])
+        hidden_states = torch.cat([lang, hidden_states, spkr], dim=1)
+
+        hidden_states = self.hifi_gan(hidden_states)
+
+        unit_lengths = self._get_dur_output_lengths(input_ids, dur_out)
+        lengths = self._get_output_hifigan_lengths(unit_lengths)
+
+        return hidden_states, lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._init_weights
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.apply_weight_norm
+    def apply_weight_norm(self):
+        nn.utils.weight_norm(self.hifi_gan.conv_pre)
+        for layer in self.hifi_gan.upsampler:
+            nn.utils.weight_norm(layer)
+        for layer in self.hifi_gan.resblocks:
+            layer.apply_weight_norm()
+        nn.utils.weight_norm(self.hifi_gan.conv_post)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.remove_weight_norm
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
+        for layer in self.hifi_gan.upsampler:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.hifi_gan.resblocks:
+            layer.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.hifi_gan.conv_post)
+
+
+############ WHOLE MODEL related code ################
+
+
+@add_start_docstrings(
+    "The text-to-text SeamlessM4Tv2 Model transformer which can be used for T2TT.",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToText with SeamlessM4T->SeamlessM4Tv2,SeamlessM4Tv2Tokenizer->SeamlessM4TTokenizer, SeamlessM4Tv2Processor->SeamlessM4TProcessor
+class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"]
+    main_input_name = "input_ids"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.text_encoder
+
+    def get_decoder(self):
+        return self.text_decoder
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def generate(
+        self,
+        input_ids=None,
+        tgt_lang=None,
+        generation_config=None,
+        logits_processor=None,
+        stopping_criteria=None,
+        prefix_allowed_tokens_fn=None,
+        synced_gpus=False,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://arxiv.org/abs/2010.00904).
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
+            [`~utils.ModelOutput`] types are:
+
+                - [`~generation.GreedySearchEncoderDecoderOutput`],
+                - [`~generation.SampleEncoderDecoderOutput`],
+                - [`~generation.BeamSearchEncoderDecoderOutput`],
+                - [`~generation.BeamSampleEncoderDecoderOutput`]
+        """
+        # prepare text_decoder_input_ids
+        text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+            if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+                # also accept __xxx__
+                tgt_lang = tgt_lang.replace("__", "")
+                if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+                        {', '.join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+                    )
+                # tgt_lang gets priority over decoder input ids
+                text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+                text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size).to(self.device)
+            else:
+                raise ValueError(
+                    """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+                    the target language to the right token id. Make sure to load the right generation config."""
+                )
+        else:
+            # only a warning, otherwise errors appear in the tests
+            logger.warning(
+                """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+                a correct generation, otherwise the generation will probably make no sense."""
+            )
+
+        return super().generate(
+            input_ids,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            decoder_input_ids=text_decoder_input_ids,
+            **kwargs,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    "The speech-to-text SeamlessM4Tv2 Model transformer which can be used for S2TT.",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["text_decoder", "t2u_model", "vocoder"]
+    main_input_name = "input_features"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_encoder
+    def get_encoder(self):
+        return self.speech_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_decoder
+    def get_decoder(self):
+        return self.text_decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_decoder.embed_tokens = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.forward
+    def forward(
+        self,
+        input_features: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.generate
+    def generate(
+        self,
+        input_features=None,
+        tgt_lang=None,
+        generation_config=None,
+        logits_processor=None,
+        stopping_criteria=None,
+        prefix_allowed_tokens_fn=None,
+        synced_gpus=False,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://arxiv.org/abs/2010.00904).
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
+            kwargs (`Dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
+            [`~utils.ModelOutput`] types are:
+
+                - [`~generation.GreedySearchEncoderDecoderOutput`],
+                - [`~generation.SampleEncoderDecoderOutput`],
+                - [`~generation.BeamSearchEncoderDecoderOutput`],
+                - [`~generation.BeamSampleEncoderDecoderOutput`]
+        """
+        text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            inputs = kwargs.get("input_embeds") if input_features is None else input_features
+            inputs = (
+                inputs
+                if inputs is not None
+                else kwargs.get("encoder_outputs", {"last_hidden_state": None})["last_hidden_state"]
+            )
+            batch_size = len(inputs)
+
+            if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+                # also accept __xxx__
+                tgt_lang = tgt_lang.replace("__", "")
+                if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+                        {', '.join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+                    )
+                # tgt_lang gets priority over decoder input ids
+                text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+                text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size).to(self.device)
+            else:
+                raise ValueError(
+                    """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+                    the target language to the right token id. Make sure to load the right generation config."""
+                )
+        else:
+            # only a warning, otherwise errors appear in the tests
+            logger.warning(
+                """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+                a correct generation, otherwise the generation will probably make no sense."""
+            )
+        return super().generate(
+            input_features,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            decoder_input_ids=text_decoder_input_ids,
+            **kwargs,
+        )
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._reorder_cache
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    "The text-to-speech SeamlessM4Tv2 Model transformer which can be used for T2ST.",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["speech_encoder"]
+    main_input_name = "input_ids"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_encoder
+    def get_encoder(self):
+        return self.text_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_decoder
+    def get_decoder(self):
+        return self.text_decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @add_start_docstrings_to_model_forward(M4T_TEXT_INPUTS_DOCSTRING)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.forward with SeamlessM4T->SeamlessM4Tv2
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This is the same forward method as `SeamlessM4Tv2ForTextToText`."
+                "It doesn't use the text-to-unit model `SeamlessM4Tv2TextToUnitForConditionalGeneration`."
+                "If you want to generate speech, use the `.generate` method."
+            )
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        speaker_id: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+        """
+        Generates translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_ids, num_beams=4, speech_do_sample=True)` will successively perform
+        beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            speaker_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+
+        Returns:
+            `Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
+            - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+            - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+              sequence_length)`and and `waveform_lengths` which gives the length of each sample.
+        """
+        batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+        if tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+        else:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+        text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size).to(self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        text_generation_output = super().generate(input_ids, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        if attention_mask is not None:
+            # repeat attention mask alongside batch dimension
+            attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+        encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+        # repeat attention mask alongside batch dimension
+        encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences[:, :-1],  # Manually trim the final EOS token
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # REMOVE EOS and lang_id
+        t2u_input_ids = sequences[:, 2:-1]
+        # replace every other EOS
+        t2u_input_ids = torch.masked_fill(
+            t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+        )
+
+        # compute t2u_char_input_ids
+        t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+        t2u_char_count_per_id = self._count_character_length_in_subword(
+            t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+        )
+
+        # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+        pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+        t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+        t2u_char_input_ids = self._get_char_input_ids(
+            t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+        )
+
+        # second pass
+        t2u_output = self.t2u_model(
+            inputs_embeds=t2u_input_embeds,
+            char_input_ids=t2u_char_input_ids,
+            char_count_per_id=t2u_char_count_per_id,
+            **kwargs_speech,
+        )
+
+        t2u_logits = t2u_output[0]
+        padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature
+        temperature = kwargs_speech.get("temperature", None)
+        if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+            unit_ids = t2u_logits.argmax(dim=-1)
+        else:
+            t2u_logits = t2u_logits / temperature
+            # apply softmax
+            probs = nn.functional.softmax(t2u_logits, dim=-1)
+            # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+            probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+            unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+        output_unit_ids = unit_ids.detach().clone()
+
+        replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace eos per pad
+        unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids)).to(self.device)
+
+        speaker_id = torch.tensor([[speaker_id]] * len(unit_ids)).to(self.device)
+
+        waveform, waveform_lengths = self.vocoder(
+            input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+        )
+
+        if return_intermediate_token_ids:
+            return SeamlessM4Tv2GenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._reorder_cache
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    "The speech-to-speech SeamlessM4Tv2 Model transformer which can be used for S2ST.",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+)
+class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel):
+    _keys_to_ignore_on_load_missing = ["text_encoder"]
+    main_input_name = "input_features"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_encoder
+    def get_encoder(self):
+        return self.speech_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_decoder
+    def get_decoder(self):
+        return self.text_decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_decoder.embed_tokens = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @add_start_docstrings_to_model_forward(M4T_SPEECH_INPUTS_DOCSTRING)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.forward with SeamlessM4T->SeamlessM4Tv2
+    def forward(
+        self,
+        input_features: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This is the same forward method as `SeamlessM4Tv2ForSpeechToText`. It doesn't use `self.t2u_model`."
+                "If you want to generate speech, use the `generate` method."
+            )
+
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_features: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        speaker_id: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+        """
+        Generates translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_features, num_beams=4, speech_do_sample=True)` will successively perform
+        beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Args:
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            speaker_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+
+        Returns:
+            `Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor]]`:
+            - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+            - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+              sequence_length)`and and `waveform_lengths` which gives the length of each sample.
+        """
+        batch_size = len(input_features) if input_features is not None else len(kwargs.get("inputs_embeds"))
+
+        if tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+        else:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+        text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size).to(self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        text_generation_output = super().generate(input_features, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        # get last_hidden_state from encoder
+        encoder_hidden_states = self.speech_encoder(input_features=input_features, attention_mask=attention_mask)[0]
+
+        # input modality = speech so new attention mask for the decoder
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_hidden_states.device
+            )
+            attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+            )
+
+            # repeat attention mask alongside batch dimension
+            attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+
+        # repeat attention mask alongside batch dimension
+        encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences[:, :-1],  # Manually trim the final EOS token
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # REMOVE EOS and lang_id
+        t2u_input_ids = sequences[:, 2:-1]
+        # replace every other EOS
+        t2u_input_ids = torch.masked_fill(
+            t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+        )
+
+        # compute t2u_char_input_ids
+        t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+        t2u_char_count_per_id = self._count_character_length_in_subword(
+            t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+        )
+
+        # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+        pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+        t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+        t2u_char_input_ids = self._get_char_input_ids(
+            t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+        )
+
+        # second pass
+        t2u_output = self.t2u_model(
+            inputs_embeds=t2u_input_embeds,
+            char_input_ids=t2u_char_input_ids,
+            char_count_per_id=t2u_char_count_per_id,
+            **kwargs_speech,
+        )
+
+        t2u_logits = t2u_output[0]
+        padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature
+        temperature = kwargs_speech.get("temperature", None)
+        if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+            unit_ids = t2u_logits.argmax(dim=-1)
+        else:
+            t2u_logits = t2u_logits / temperature
+            # apply softmax
+            probs = nn.functional.softmax(t2u_logits, dim=-1)
+            # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+            probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+            unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+        output_unit_ids = unit_ids.detach().clone()
+
+        replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace eos per pad
+        unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids)).to(self.device)
+
+        speaker_id = torch.tensor([[speaker_id]] * len(unit_ids)).to(self.device)
+
+        waveform, waveform_lengths = self.vocoder(
+            input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+        )
+
+        if return_intermediate_token_ids:
+            return SeamlessM4Tv2GenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+    @staticmethod
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech._reorder_cache
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+            )
+        return reordered_past
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+
+
+@add_start_docstrings(
+    "The original SeamlessM4Tv2 Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).",
+    SEAMLESS_M4T_V2_START_DOCSTRING,
+    """
+        current_modality (`str`, *optional*, defaults to `"text"`):
+            Default modality. Used only to initialize the model. It can be set to `"text"` or `"speech"`.
+            This will be updated automatically according to the modality passed to the forward and generate passes (`input_ids` for text and `input_features` for audio).
+    """,
+)
+class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel):
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config, current_modality="text"):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+        self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.current_modality = current_modality
+        if current_modality == "speech":
+            self.main_input_name = "input_features"
+
+        # these models already call post_init in their initialization
+        self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_modality
+    def set_modality(self, modality="text"):
+        if modality == "text":
+            self.main_input_name = "input_ids"
+            self.current_modality = "text"
+        elif modality == "speech":
+            self.main_input_name = "input_features"
+            self.current_modality = "speech"
+        else:
+            raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.")
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_encoder
+    def get_encoder(self):
+        if self.current_modality == "text":
+            return self.text_encoder
+        else:
+            return self.speech_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_output_embeddings
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_output_embeddings
+    def set_output_embeddings(self, new_embeddings):
+        self.lm_head = new_embeddings
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @add_start_docstrings_to_model_forward(M4T_MODEL_INPUTS_DOCSTRING)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.forward with SeamlessM4T->SeamlessM4Tv2
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, Tuple[torch.FloatTensor]]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        if input_ids is None and input_features is None and inputs_embeds is None and encoder_outputs is None:
+            raise ValueError(
+                "`input_ids`,`input_features`, `inputs_embeds` and `encoder_outputs` are all empty. Make sure at least one of them is not."
+            )
+        elif input_features is not None:
+            if input_ids is not None:
+                logger.warning(
+                    "`input_ids` is not `None` but `input_features` has been given."
+                    "`input_features` will be used in priority through the `speech_encoder`. "
+                    "Make sure that `input_features` and `input_ids` are mutually exclusive."
+                )
+
+            if inputs_embeds is not None:
+                logger.warning(
+                    "`inputs_embeds` is not `None` but `input_features` has been given."
+                    "`input_features` will be used in priority through `speech_encoder`. "
+                    "`inputs_embeds` will be ignored."
+                )
+
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText`"
+                "depending on the input modality. If you want to generate speech, use the `generate` method."
+            )
+
+            self.set_modality("speech")
+
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        elif input_ids is not None or inputs_embeds is not None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText`"
+                "depending on the input modality. If you want to generate speech, use the `generate` method."
+            )
+            self.set_modality("text")
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        # input modality = speech so new attention mask
+        if self.current_modality == "speech" and attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        speaker_id: Optional[int] = 0,
+        generate_speech: Optional[bool] = True,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+        """
+        Generates translated token ids and/or translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_ids=input_ids, num_beams=4, speech_do_sample=True)` will successively
+        perform beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
+                Input audio features. This should be returnes by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio. Note that if `generate_speech=True`, this parameter will be
+                ignored.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            speaker_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            generate_speech (`bool`, *optional*, defaults to `True`):
+                If `False`, will only returns the text tokens and won't generate speech.
+
+            kwargs (*optional*):
+                Remaining dictioy of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+        Returns:
+            `Union[SeamlessM4Tv2GenerationOutput, Tuple[Tensor], ModelOutput]`:
+            - If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+            - If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of
+              shape `(batch_size, sequence_length)`and and `waveform_lengths` which gives the length of each sample.
+            - If `generate_speech=False`, it will returns `ModelOutput`.
+        """
+        if input_ids is None and input_features is None and kwargs.get("inputs_embeds", None) is None:
+            raise ValueError(
+                "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not."
+            )
+
+        if generate_speech and tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+
+        if tgt_lang is not None:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {','.join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        batch_size = (
+            len(input_features)
+            if input_features is not None
+            else (len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")))
+        )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            # tgt_lang gets priority over decoder input ids
+            text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+            text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size).to(self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        if input_features is not None:
+            self.set_modality("speech")
+            if input_ids is not None:
+                logger.warning(
+                    "`input_features` and `input_ids` are both non empty. `input_features` will be used in priority "
+                    "through the speech encoder. Make sure `input_features=None` if you want to use the text encoder."
+                )
+            text_generation_output = super().generate(input_features=input_features, **kwargs_text)
+        else:
+            self.set_modality("text")
+            text_generation_output = super().generate(input_ids=input_ids, input_features=None, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        if not generate_speech:
+            return text_generation_output
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        # get encoder last hidden states
+        if self.current_modality == "speech":
+            # get last_hidden_state from encoder - must do a pass through the speech encoder
+            encoder_hidden_states = self.speech_encoder(
+                input_features=input_features, attention_mask=attention_mask
+            ).last_hidden_state
+
+            # input modality = speech so new attention mask for the decoder
+            if attention_mask is not None:
+                sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                    encoder_hidden_states.device
+                )
+                attention_mask = _compute_new_attention_mask(
+                    hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+                )
+        else:
+            encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+        if attention_mask is not None:
+            # repeat attention mask alongside batch dimension
+            attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+
+        # repeat attention mask alongside batch dimension
+        encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences[:, :-1],  # Manually trim the final EOS token
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # REMOVE EOS and lang_id
+        t2u_input_ids = sequences[:, 2:-1]
+        # replace every other EOS
+        t2u_input_ids = torch.masked_fill(
+            t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+        )
+
+        # compute t2u_char_input_ids
+        t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+        t2u_char_count_per_id = self._count_character_length_in_subword(
+            t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+        )
+
+        # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+        pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+        t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+        t2u_char_input_ids = self._get_char_input_ids(
+            t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+        )
+
+        # second pass
+        t2u_output = self.t2u_model(
+            inputs_embeds=t2u_input_embeds,
+            char_input_ids=t2u_char_input_ids,
+            char_count_per_id=t2u_char_count_per_id,
+            **kwargs_speech,
+        )
+
+        t2u_logits = t2u_output[0]
+        padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature
+        temperature = kwargs_speech.get("temperature", None)
+        if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+            unit_ids = t2u_logits.argmax(dim=-1)
+        else:
+            t2u_logits = t2u_logits / temperature
+            # apply softmax
+            probs = nn.functional.softmax(t2u_logits, dim=-1)
+            # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+            probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+            unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+        output_unit_ids = unit_ids.detach().clone()
+
+        replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace eos per pad
+        unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids)).to(self.device)
+
+        speaker_id = torch.tensor([[speaker_id]] * len(unit_ids)).to(self.device)
+
+        waveform, waveform_lengths = self.vocoder(
+            input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+        )
+
+        if return_intermediate_token_ids:
+            return SeamlessM4Tv2GenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._reorder_cache
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx) for past_state in layer_past[:2]) + layer_past[2:],
+            )
+        return reordered_past
diff --git a/src/transformers/models/speech_to_text/modeling_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
index 57c74c8c42e2a6..76e088415ab88d 100755
--- a/src/transformers/models/speech_to_text/modeling_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_speech_to_text.py
@@ -30,7 +30,12 @@
     Seq2SeqModelOutput,
 )
 from ...modeling_utils import PreTrainedModel
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_speech_to_text import Speech2TextConfig
 
 
@@ -326,7 +331,7 @@ def forward(
         return attn_output, attn_weights_reshaped, past_key_value
 
 
-SPEECH_TO_TEXT_ATTENTION_CLASSES = {"default": Speech2TextAttention}
+SPEECH_TO_TEXT_ATTENTION_CLASSES = {"eager": Speech2TextAttention}
 
 
 # Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Speech2Text, MBART->SPEECH_TO_TEXT
@@ -334,9 +339,8 @@ class Speech2TextEncoderLayer(nn.Module):
     def __init__(self, config: Speech2TextConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[attn_type](
+        self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -406,9 +410,8 @@ class Speech2TextDecoderLayer(nn.Module):
     def __init__(self, config: Speech2TextConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[attn_type](
+        self.self_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -421,7 +424,7 @@ def __init__(self, config: Speech2TextConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = SPEECH_TO_TEXT_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
diff --git a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
index 026d2241b461ea..4c6d2ffcb3e014 100755
--- a/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
+++ b/src/transformers/models/speech_to_text/modeling_tf_speech_to_text.py
@@ -1460,3 +1460,9 @@ def prepare_inputs_for_generation(
             "cross_attn_head_mask": cross_attn_head_mask,
             "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
         }
+
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if tf_weight == "lm_head.weight":
+            return tf_weight, "model.decoder.embed_tokens.weight"
+        else:
+            return (tf_weight,)
diff --git a/src/transformers/models/switch_transformers/modeling_switch_transformers.py b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
index 60c100d37c48b6..3a6d19c3478d98 100644
--- a/src/transformers/models/switch_transformers/modeling_switch_transformers.py
+++ b/src/transformers/models/switch_transformers/modeling_switch_transformers.py
@@ -169,17 +169,8 @@ def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> Tuple[to
         hidden_states = hidden_states.to(self.dtype)
 
         if self.jitter_noise > 0:
-            # Get the lower and upper bound of the uniform distribution
-            # Adapted from: https://stackoverflow.com/questions/44328530/how-to-get-a-uniform-distribution-in-a-range-r1-r2-in-pytorch
-            distrib_lower_bound = 1.0 - self.jitter_noise
-            distrib_upper_bound = 1.0 + self.jitter_noise
-
-            uniform_distrib = torch.rand(hidden_states.shape, device=hidden_states.device, dtype=self.dtype)
-            uniform_distrib = uniform_distrib * (distrib_lower_bound - distrib_upper_bound)
-
-            uniform_distrib = uniform_distrib + distrib_upper_bound
             # Multiply the token inputs by the uniform distribution - adding some noise
-            hidden_states *= uniform_distrib
+            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
 
         # Shape: [num_groups, tokens_per_group, num_experts]
         self._cast_classifier()
diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py
index 922d9b67105fc6..4ff1cf29c03902 100644
--- a/src/transformers/models/t5/tokenization_t5.py
+++ b/src/transformers/models/t5/tokenization_t5.py
@@ -183,7 +183,7 @@ def __init__(
         self._added_tokens_decoder = {}
         for i in range(len(extra_tokens)):
             self._added_tokens_decoder[len(self.sp_model) - 1 + extra_ids - i] = AddedToken(
-                f"<extra_id_{i}>", single_word=True, lstrip=True, rstrip=True, special=True
+                f"<extra_id_{i}>", single_word=False, lstrip=True, rstrip=True, special=True, normalized=False
             )
 
         if legacy is None:
diff --git a/src/transformers/models/table_transformer/modeling_table_transformer.py b/src/transformers/models/table_transformer/modeling_table_transformer.py
index 92aac58e745698..81afcdc9c18f81 100644
--- a/src/transformers/models/table_transformer/modeling_table_transformer.py
+++ b/src/transformers/models/table_transformer/modeling_table_transformer.py
@@ -1431,7 +1431,7 @@ def forward(
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
 
-        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> target_sizes = torch.tensor([image.size[::-1]])
         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
         ...     0
diff --git a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
index 904c02b4f04308..b6e86735c6a3d0 100644
--- a/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
+++ b/src/transformers/models/time_series_transformer/modeling_time_series_transformer.py
@@ -32,7 +32,12 @@
 )
 from ...modeling_utils import PreTrainedModel
 from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput
-from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
 from .configuration_time_series_transformer import TimeSeriesTransformerConfig
 
 
@@ -83,67 +88,66 @@ def forward(self, features: torch.Tensor) -> torch.Tensor:
 
 class TimeSeriesStdScaler(nn.Module):
     """
-    Standardize features by calculating the mean and scaling along some given dimension `dim`, and then normalizes it
-    by subtracting from the mean and dividing by the standard deviation.
-
-    Args:
-        dim (`int`):
-            Dimension along which to calculate the mean and standard deviation.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        minimum_scale (`float`, *optional*, defaults to 1e-5):
-            Default scale that is used for elements that are constantly zero along dimension `dim`.
+    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
+    subtracting from the mean and dividing by the standard deviation.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False, minimum_scale: float = 1e-5):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
-        if not dim > 0:
-            raise ValueError("Cannot compute scale along dim = 0 (batch dimension), please provide dim > 0")
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5
 
-    @torch.no_grad()
-    def forward(self, data: torch.Tensor, weights: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        denominator = weights.sum(self.dim, keepdim=self.keepdim)
+    def forward(
+        self, data: torch.Tensor, observed_indicator: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
+        denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim)
         denominator = denominator.clamp_min(1.0)
-        loc = (data * weights).sum(self.dim, keepdim=self.keepdim) / denominator
+        loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator
 
-        variance = (((data - loc) * weights) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
+        variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator
         scale = torch.sqrt(variance + self.minimum_scale)
         return (data - loc) / scale, loc, scale
 
 
 class TimeSeriesMeanScaler(nn.Module):
     """
-    Computes a scaling factor as the weighted average absolute value along dimension `dim`, and scales the data
+    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
     accordingly.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
-        default_scale (`float`, *optional*, defaults to `None`):
-            Default scale that is used for elements that are constantly zero. If `None`, we use the scale of the batch.
-        minimum_scale (`float`, *optional*, defaults to 1e-10):
-            Default minimum possible scale that is used for any item.
     """
 
-    def __init__(
-        self, dim: int = -1, keepdim: bool = True, default_scale: Optional[float] = None, minimum_scale: float = 1e-10
-    ):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
-        self.minimum_scale = minimum_scale
-        self.default_scale = default_scale
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
+        self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10
+        self.default_scale = config.default_scale if hasattr(config, "default_scale") else None
 
-    @torch.no_grad()
     def forward(
         self, data: torch.Tensor, observed_indicator: torch.Tensor
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        # shape: (N, [C], T=1)
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                Calculating the scale on the observed indicator.
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
         ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True)
         num_observed = observed_indicator.sum(self.dim, keepdim=True)
 
@@ -173,23 +177,26 @@ def forward(
 
 class TimeSeriesNOPScaler(nn.Module):
     """
-    Assigns a scaling factor equal to 1 along dimension `dim`, and therefore applies no scaling to the input data.
-
-    Args:
-        dim (`int`):
-            Dimension along which to compute the scale.
-        keepdim (`bool`, *optional*, defaults to `False`):
-            Controls whether to retain dimension `dim` (of length 1) in the scale tensor, or suppress it.
+    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
     """
 
-    def __init__(self, dim: int, keepdim: bool = False):
+    def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
-        self.dim = dim
-        self.keepdim = keepdim
+        self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1
+        self.keepdim = config.keepdim if hasattr(config, "keepdim") else True
 
     def forward(
-        self, data: torch.Tensor, observed_indicator: torch.Tensor
+        self, data: torch.Tensor, observed_indicator: torch.Tensor = None
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """
+        Parameters:
+            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
+                input for Batch norm calculation
+        Returns:
+            tuple of `torch.Tensor` of shapes
+                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
+                `(batch_size, 1, num_input_channels)`)
+        """
         scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
         loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim)
         return data, loc, scale
@@ -434,9 +441,8 @@ class TimeSeriesTransformerEncoderLayer(nn.Module):
     def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[attn_type](
+        self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -501,7 +507,10 @@ def forward(
         return outputs
 
 
-TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES = {"default": TimeSeriesTransformerAttention}
+# TODO: Implement attention with SDPA for TimeSeriesTransformer.
+TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES = {
+    "eager": TimeSeriesTransformerAttention,
+}
 
 
 # Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->TimeSeriesTransformer, with BART->TIME_SERIES_TRANSFORMER
@@ -510,8 +519,7 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__()
         self.embed_dim = config.d_model
 
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
-        self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[attn_type](
+        self.self_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -524,7 +532,7 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = TIME_SERIES_TRANSFORMER_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -1180,11 +1188,11 @@ def __init__(self, config: TimeSeriesTransformerConfig):
         super().__init__(config)
 
         if config.scaling == "mean" or config.scaling is True:
-            self.scaler = TimeSeriesMeanScaler(dim=1, keepdim=True)
+            self.scaler = TimeSeriesMeanScaler(config)
         elif config.scaling == "std":
-            self.scaler = TimeSeriesStdScaler(dim=1, keepdim=True)
+            self.scaler = TimeSeriesStdScaler(config)
         else:
-            self.scaler = TimeSeriesNOPScaler(dim=1, keepdim=True)
+            self.scaler = TimeSeriesNOPScaler(config)
 
         if config.num_static_categorical_features > 0:
             self.embedder = TimeSeriesFeatureEmbedder(
diff --git a/src/transformers/models/timesformer/modeling_timesformer.py b/src/transformers/models/timesformer/modeling_timesformer.py
index 1f201b6a5e440f..73ce6bf7737f62 100644
--- a/src/transformers/models/timesformer/modeling_timesformer.py
+++ b/src/transformers/models/timesformer/modeling_timesformer.py
@@ -305,7 +305,7 @@ def __init__(self, config: TimesformerConfig, layer_index: int) -> None:
         ]  # stochastic depth decay rule
         drop_path_rate = drop_path_rates[layer_index]
 
-        self.drop_path = TimeSformerDropPath(config.drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.drop_path = TimeSformerDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
         self.attention = TimeSformerAttention(config)
         self.intermediate = TimesformerIntermediate(config)
         self.output = TimesformerOutput(config)
diff --git a/src/transformers/models/tvp/__init__.py b/src/transformers/models/tvp/__init__.py
new file mode 100644
index 00000000000000..63c0bd27174471
--- /dev/null
+++ b/src/transformers/models/tvp/__init__.py
@@ -0,0 +1,80 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
+
+
+_import_structure = {
+    "configuration_tvp": [
+        "TVP_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "TvpConfig",
+    ],
+    "processing_tvp": ["TvpProcessor"],
+}
+
+try:
+    if not is_vision_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["image_processing_tvp"] = ["TvpImageProcessor"]
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_tvp"] = [
+        "TVP_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "TvpModel",
+        "TvpPreTrainedModel",
+        "TvpForVideoGrounding",
+    ]
+
+if TYPE_CHECKING:
+    from .configuration_tvp import (
+        TVP_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TvpConfig,
+    )
+    from .processing_tvp import TvpProcessor
+
+    try:
+        if not is_vision_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .image_processing_tvp import TvpImageProcessor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_tvp import (
+            TVP_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TvpForVideoGrounding,
+            TvpModel,
+            TvpPreTrainedModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/tvp/configuration_tvp.py b/src/transformers/models/tvp/configuration_tvp.py
new file mode 100644
index 00000000000000..dfb0a5f9985a9e
--- /dev/null
+++ b/src/transformers/models/tvp/configuration_tvp.py
@@ -0,0 +1,175 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TVP model configuration"""
+
+import copy
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+TVP_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "Intel/tvp-base": "https://huggingface.co/Intel/tvp-base/resolve/main/config.json",
+}
+
+
+class TvpConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`TvpModel`]. It is used to instantiate an Tvp
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Tvp
+    [Intel/tvp-base](https://huggingface.co/Intel/tvp-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        backbone_config (`PretrainedConfig` or `dict`, *optional*):
+            The configuration of the backbone model.
+        distance_loss_weight (`float`, *optional*, defaults to 1.0):
+            The weight of distance loss.
+        duration_loss_weight (`float`, *optional*, defaults to 0.1):
+            The weight of duration loss.
+        visual_prompter_type (`str`, *optional*, defaults to `"framepad"`):
+            Visual prompt type. The type of padding. Framepad means padding on each frame. Should be one of "framepad"
+            or "framedownpad"
+        visual_prompter_apply (`str`, *optional*, defaults to `"replace"`):
+            The way of applying visual prompt. Replace means use the value of prompt to change the original value in
+            visual inputs. Should be one of "replace", or "add", or "remove".
+        visual_prompt_size (`int`, *optional*, defaults to 96):
+            The size of visual prompt.
+        max_img_size (`int`, *optional*, defaults to 448):
+            The maximum size of frame.
+        num_frames (`int`, *optional*, defaults to 48):
+            The number of frames extracted from a video.
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Tvp text model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`TvpModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        max_grid_col_position_embeddings (`int`, *optional*, defaults to 100):
+            The largest number of horizontal patches from a video frame.
+        max_grid_row_position_embeddings (`int`, *optional*, defaults to 100):
+            The largest number of vertical patches from a video frame.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability of hidden layers.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` ``"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability of attention layers.
+    """
+
+    model_type = "tvp"
+
+    def __init__(
+        self,
+        backbone_config=None,
+        distance_loss_weight=1.0,
+        duration_loss_weight=0.1,
+        visual_prompter_type="framepad",
+        visual_prompter_apply="replace",
+        visual_prompt_size=96,
+        max_img_size=448,
+        num_frames=48,
+        vocab_size=30522,
+        hidden_size=768,
+        intermediate_size=3072,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        max_position_embeddings=512,
+        max_grid_col_position_embeddings=100,
+        max_grid_row_position_embeddings=100,
+        hidden_dropout_prob=0.1,
+        hidden_act="gelu",
+        layer_norm_eps=1e-12,
+        initializer_range=0.02,
+        attention_probs_dropout_prob=0.1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if backbone_config is None:
+            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage4"])
+        elif isinstance(backbone_config, dict):
+            backbone_model_type = backbone_config.get("model_type")
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class.from_dict(backbone_config)
+
+        self.backbone_config = backbone_config
+        self.distance_loss_weight = distance_loss_weight
+        self.duration_loss_weight = duration_loss_weight
+        self.visual_prompter_type = visual_prompter_type
+        self.visual_prompter_apply = visual_prompter_apply
+        self.visual_prompt_size = visual_prompt_size
+        self.max_img_size = max_img_size
+        self.num_frames = num_frames
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.max_grid_col_position_embeddings = max_grid_col_position_embeddings
+        self.max_grid_row_position_embeddings = max_grid_row_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+
+    @classmethod
+    def from_backbone_config(cls, backbone_config: PretrainedConfig, **kwargs):
+        """Instantiate a [`TvpConfig`] (or a derived class) from a pre-trained backbone model configuration.
+
+        Args:
+            backbone_config ([`PretrainedConfig`]):
+                The backbone configuration.
+        Returns:
+            [`TvpConfig`]: An instance of a configuration object
+        """
+        return cls(backbone_config=backbone_config, **kwargs)
+
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
+
+        Returns:
+            `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
+        """
+        output = copy.deepcopy(self.__dict__)
+        if output["backbone_config"] is not None:
+            output["backbone_config"] = self.backbone_config.to_dict()
+        output["model_type"] = self.__class__.model_type
+        return output
diff --git a/src/transformers/models/tvp/image_processing_tvp.py b/src/transformers/models/tvp/image_processing_tvp.py
new file mode 100644
index 00000000000000..5363d504319520
--- /dev/null
+++ b/src/transformers/models/tvp/image_processing_tvp.py
@@ -0,0 +1,476 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for TVP."""
+
+from typing import Dict, Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    PaddingMode,
+    flip_channel_order,
+    pad,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    is_valid_image,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.vivit.image_processing_vivit.make_batched
+def make_batched(videos) -> List[List[ImageInput]]:
+    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)) and is_valid_image(videos[0][0]):
+        return videos
+
+    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+        return [videos]
+
+    elif is_valid_image(videos):
+        return [[videos]]
+
+    raise ValueError(f"Could not make batched video from {videos}")
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    max_size: int = 448,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    height, width = get_image_size(input_image, input_data_format)
+    if height >= width:
+        ratio = width * 1.0 / height
+        new_height = max_size
+        new_width = new_height * ratio
+    else:
+        ratio = height * 1.0 / width
+        new_width = max_size
+        new_height = new_width * ratio
+    size = (int(new_height), int(new_width))
+
+    return size
+
+
+class TvpImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Tvp image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`Dict[str, int]` *optional*, defaults to `{"longest_edge": 448}`):
+            Size of the output image after resizing. The longest edge of the image will be resized to
+            `size["longest_edge"]` while maintaining the aspect ratio of the original image. Can be overriden by
+            `size` in the `preprocess` method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
+            parameter in the `preprocess` method.
+        crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 448, "width": 448}`):
+            Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
+            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
+            in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` method.
+        pad_size (`Dict[str, int]`, *optional*, defaults to `{"height": 448, "width": 448}`):
+            Size of the image after applying the padding. Can be overridden by the `pad_size` parameter in the
+            `preprocess` method.
+        constant_values (`Union[float, Iterable[float]]`, *optional*, defaults to 0):
+            The fill value to use when padding the image.
+        pad_mode (`PaddingMode`, *optional*, defaults to `PaddingMode.CONSTANT`):
+            Use what kind of mode in padding.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        do_flip_channel_order (`bool`, *optional*, defaults to `True`):
+            Whether to flip the color channels from RGB to BGR. Can be overridden by the `do_flip_channel_order`
+            parameter in the `preprocess` method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_center_crop: bool = True,
+        crop_size: Dict[str, int] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_pad: bool = True,
+        pad_size: Dict[str, int] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        pad_mode: PaddingMode = PaddingMode.CONSTANT,
+        do_normalize: bool = True,
+        do_flip_channel_order: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"longest_edge": 448}
+        crop_size = crop_size if crop_size is not None else {"height": 448, "width": 448}
+        pad_size = pad_size if pad_size is not None else {"height": 448, "width": 448}
+
+        self.do_resize = do_resize
+        self.size = size
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+        self.constant_values = constant_values
+        self.pad_mode = pad_mode
+        self.do_normalize = do_normalize
+        self.do_flip_channel_order = do_flip_channel_order
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
+                have the size `(h, w)`. If `size` is of the form `{"longest_edge": s}`, the output image will have its
+                longest edge of length `s` while keeping the aspect ratio of the original image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use when resiizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size = get_size_dict(size, default_to_square=False)
+        if "height" in size and "width" in size:
+            output_size = (size["height"], size["width"])
+        elif "longest_edge" in size:
+            output_size = get_resize_output_image_size(image, size["longest_edge"], input_data_format)
+        else:
+            raise ValueError(f"Size must have 'height' and 'width' or 'longest_edge' as keys. Got {size.keys()}")
+
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def pad_image(
+        self,
+        image: np.ndarray,
+        pad_size: Dict[str, int] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        pad_mode: PaddingMode = PaddingMode.CONSTANT,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ):
+        """
+        Pad an image with zeros to the given size.
+
+        Args:
+            image (`np.ndarray`):
+                Image to pad.
+            pad_size (`Dict[str, int]`)
+                Size of the output image with pad.
+            constant_values (`Union[float, Iterable[float]]`)
+                The fill value to use when padding the image.
+            pad_mode (`PaddingMode`)
+                The pad mode, default to PaddingMode.CONSTANT
+            data_format (`ChannelDimension` or `str`, *optional*)
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        height, width = get_image_size(image, channel_dim=input_data_format)
+        max_height = pad_size.get("height", height)
+        max_width = pad_size.get("width", width)
+
+        pad_right, pad_bottom = max_width - width, max_height - height
+        if pad_right < 0 or pad_bottom < 0:
+            raise ValueError("The padding size must be greater than image size")
+
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=pad_mode,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+
+        return padded_image
+
+    def _preprocess_image(
+        self,
+        image: ImageInput,
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: bool = None,
+        crop_size: Dict[str, int] = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_pad: bool = True,
+        pad_size: Dict[str, int] = None,
+        constant_values: Union[float, Iterable[float]] = None,
+        pad_mode: PaddingMode = None,
+        do_normalize: bool = None,
+        do_flip_channel_order: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """Preprocesses a single image."""
+        if do_resize and size is None or resample is None:
+            raise ValueError("Size and resample must be specified if do_resize is True.")
+
+        if do_center_crop and crop_size is None:
+            raise ValueError("Crop size must be specified if do_center_crop is True.")
+
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+        if do_pad and pad_size is None:
+            raise ValueError("Padding size must be specified if do_pad is True.")
+
+        if do_normalize and (image_mean is None or image_std is None):
+            raise ValueError("Image mean and std must be specified if do_normalize is True.")
+
+        # All transformations expect numpy arrays.
+        image = to_numpy_array(image)
+
+        if do_resize:
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+        if do_center_crop:
+            image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(
+                image=image.astype(np.float32), mean=image_mean, std=image_std, input_data_format=input_data_format
+            )
+
+        if do_pad:
+            image = self.pad_image(
+                image=image,
+                pad_size=pad_size,
+                constant_values=constant_values,
+                pad_mode=pad_mode,
+                input_data_format=input_data_format,
+            )
+
+        # the pretrained checkpoints assume images are BGR, not RGB
+        if do_flip_channel_order:
+            image = flip_channel_order(image=image, input_data_format=input_data_format)
+
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+        return image
+
+    def preprocess(
+        self,
+        videos: Union[ImageInput, List[ImageInput], List[List[ImageInput]]],
+        do_resize: bool = None,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = None,
+        do_center_crop: bool = None,
+        crop_size: Dict[str, int] = None,
+        do_rescale: bool = None,
+        rescale_factor: float = None,
+        do_pad: bool = None,
+        pad_size: Dict[str, int] = None,
+        constant_values: Union[float, Iterable[float]] = None,
+        pad_mode: PaddingMode = None,
+        do_normalize: bool = None,
+        do_flip_channel_order: bool = None,
+        image_mean: Optional[Union[float, List[float]]] = None,
+        image_std: Optional[Union[float, List[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            videos (`ImageInput` or `List[ImageInput]` or `List[List[ImageInput]]`):
+                Frames to preprocess.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after applying resize.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_centre_crop`):
+                Whether to centre crop the image.
+            crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the image after applying the centre crop.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_pad (`bool`, *optional*, defaults to `True`):
+                Whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess` method.
+            pad_size (`Dict[str, int]`, *optional*, defaults to `{"height": 448, "width": 448}`):
+                Size of the image after applying the padding. Can be overridden by the `pad_size` parameter in the
+                `preprocess` method.
+            constant_values (`Union[float, Iterable[float]]`, *optional*, defaults to 0):
+                The fill value to use when padding the image.
+            pad_mode (`PaddingMode`, *optional*, defaults to "PaddingMode.CONSTANT"):
+                Use what kind of mode in padding.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            do_flip_channel_order (`bool`, *optional*, defaults to `self.do_flip_channel_order`):
+                Whether to flip the channel order of the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - Unset: Use the inferred channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        constant_values = constant_values if constant_values is not None else self.constant_values
+        pad_mode = pad_mode if pad_mode else self.pad_mode
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_flip_channel_order = (
+            do_flip_channel_order if do_flip_channel_order is not None else self.do_flip_channel_order
+        )
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        if not valid_images(videos):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        videos = make_batched(videos)
+
+        videos = [
+            np.array(
+                [
+                    self._preprocess_image(
+                        image=img,
+                        do_resize=do_resize,
+                        size=size,
+                        resample=resample,
+                        do_center_crop=do_center_crop,
+                        crop_size=crop_size,
+                        do_rescale=do_rescale,
+                        rescale_factor=rescale_factor,
+                        do_pad=do_pad,
+                        pad_size=pad_size,
+                        constant_values=constant_values,
+                        pad_mode=pad_mode,
+                        do_normalize=do_normalize,
+                        do_flip_channel_order=do_flip_channel_order,
+                        image_mean=image_mean,
+                        image_std=image_std,
+                        data_format=data_format,
+                        input_data_format=input_data_format,
+                    )
+                    for img in video
+                ]
+            )
+            for video in videos
+        ]
+
+        data = {"pixel_values": videos}
+        return BatchFeature(data=data, tensor_type=return_tensors)
diff --git a/src/transformers/models/tvp/modeling_tvp.py b/src/transformers/models/tvp/modeling_tvp.py
new file mode 100644
index 00000000000000..65fac8b3a0e0ce
--- /dev/null
+++ b/src/transformers/models/tvp/modeling_tvp.py
@@ -0,0 +1,895 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch TVP Model"""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...activations import ACT2FN
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward, replace_return_docstrings
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import prune_linear_layer
+from ...utils import logging
+from ..auto import AutoBackbone
+from .configuration_tvp import TvpConfig
+
+
+logger = logging.get_logger(__name__)
+
+TVP_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "Intel/tvp-base",
+    "Intel/tvp-base-ANet",
+    # See all Tvp models at https://huggingface.co/models?filter=tvp
+]
+
+
+@dataclass
+class TvpVideoGroundingOutput(ModelOutput):
+    """
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+            Temporal-Distance IoU loss for video grounding.
+        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+            Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
+            input texts.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
+            the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class TvpLoss(nn.Module):
+    """
+    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
+    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
+    ground-truth / prediction (supervise class and box).
+
+    Args:
+        losses (`List[str]`):
+            List of all the losses to be applied.
+    """
+
+    def __init__(self, losses):
+        super().__init__()
+        self.loss_map = {
+            "iou": self.loss_iou,
+            "distance": self.loss_distance,
+            "duration": self.loss_duration,
+        }
+        for loss in losses:
+            if loss not in self.loss_map:
+                raise ValueError(f"Loss {loss} not supported")
+
+        self.losses = losses
+
+    def loss_iou(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
+        """
+        Measure the intersection over union.
+        """
+        inter = torch.min(candidates_end_time, end_time) - torch.max(candidates_start_time, start_time)
+        union = torch.max(candidates_end_time, end_time) - torch.min(candidates_start_time, start_time)
+        iou = 1 - inter.clamp(min=0) / union
+
+        return iou
+
+    def loss_distance(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
+        """
+        Measure the distance of mid points.
+        """
+        mid_candidates = torch.div(torch.add(candidates_start_time, candidates_end_time), 2.0)
+        mid_groundtruth = torch.div(torch.add(start_time, end_time), 2.0)
+        distance_diff = torch.div(
+            torch.max(mid_candidates, mid_groundtruth) - torch.min(mid_candidates, mid_groundtruth), duration
+        ).clamp(min=0.2)
+
+        return distance_diff
+
+    def loss_duration(self, start_time, end_time, candidates_start_time, candidates_end_time, duration):
+        """
+        Measure the difference of duration.
+        """
+        duration_candidates = torch.sub(candidates_end_time, candidates_start_time)
+        duration_groundtruth = torch.sub(end_time, start_time)
+        duration_diff = torch.square(torch.div(torch.sub(duration_candidates, duration_groundtruth), duration))
+        duration_diff = duration_diff.clamp(min=0.4)
+
+        return duration_diff
+
+    def forward(self, logits, labels):
+        """
+        This performs the loss computation.
+
+        Args:
+            logits (`torch.FloatTensor`):
+                The output logits of head module.
+            labels (`List[torch.FloatTensor]`):
+                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
+        """
+        duration, start_time, end_time = labels
+        candidates = torch.mul(logits, duration)
+        candidates_start_time, candidates_end_time = candidates[:, 0].float(), candidates[:, 1].float()
+
+        losses_dict = {}
+        for loss in self.losses:
+            losses_dict.update(
+                {loss: self.loss_map[loss](start_time, end_time, candidates_start_time, candidates_end_time, duration)}
+            )
+
+        return losses_dict
+
+
+class TvpVisionModel(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.backbone = AutoBackbone.from_config(config.backbone_config)
+        self.grid_encoder_conv = nn.Conv2d(
+            config.backbone_config.hidden_sizes[-1],
+            config.hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            groups=1,
+            bias=False,
+        )
+
+    def forward(self, pixel_values):
+        batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        # (batch_size * num_frames, num_channels, height, width)
+        pixel_values = pixel_values.view(batch_size * num_frames, num_channels, height, width)
+        grid_feat_outputs = self.backbone(pixel_values)["feature_maps"][0]
+        grid = self.grid_encoder_conv(grid_feat_outputs)
+        grid = nn.functional.max_pool2d(grid, kernel_size=2, stride=2)
+        grid = nn.functional.relu(grid, inplace=True)
+        new_channel, new_height, new_width = grid.shape[-3:]
+        # (batch_size, num_frames, num_channels, height, width)
+        grid = grid.view(batch_size, num_frames, new_channel, new_height, new_width)
+        # (batch_size, num_frames, height, width, num_channels)
+        grid = grid.permute(0, 1, 3, 4, 2)
+        return grid
+
+
+class TvpVisualInputEmbedding(nn.Module):
+    """
+    Takes input of both image and video (multi-frame)
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        # sequence embedding
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.row_position_embeddings = nn.Embedding(config.max_grid_row_position_embeddings, config.hidden_size)
+        self.col_position_embeddings = nn.Embedding(config.max_grid_col_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def add_2d_positional_embeddings(self, grid):
+        """
+        Args:
+            grid: (batch_size, height, width, hidden_dim)
+        Returns:
+            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
+        """
+        batch_size, height, width, hidden_dim = grid.shape
+
+        # add row-wise position embeddings
+        row_position_ids = torch.arange(height, dtype=torch.long, device=grid.device)  # (height, )
+        row_position_embeddings = self.row_position_embeddings(row_position_ids)  # (height, hidden_dim)
+        row_shape = (1,) * (len(grid.shape) - 3) + (height, 1, hidden_dim)  # (1, height, 1, hidden_dim)
+        grid = grid + row_position_embeddings.view(*row_shape)  # broadcast automatically
+
+        # add column-wise position embeddings
+        col_position_ids = torch.arange(width, dtype=torch.long, device=grid.device)  # (width, )
+        col_position_embeddings = self.col_position_embeddings(col_position_ids)  # (width, hidden_dim)
+        col_shape = (batch_size, 1, width, hidden_dim)  # (1, 1, width, hidden_dim)
+        return grid + col_position_embeddings.view(*col_shape)  # broadcast automatically
+
+    def forward(self, grid):
+        """
+        Args:
+            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
+                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
+                num_frames can be 1
+
+        Returns:
+            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)
+
+        """
+        batch_size, num_frames, height, width, num_channels = grid.shape
+        # temporal mean pooling, (batch_size, height, width, hidden_size)
+        grid = grid.mean(1)
+        grid = self.add_2d_positional_embeddings(grid)
+        # image token sequence, (batch_size, height*width, num_channels)
+        visual_tokens = grid.view(batch_size, -1, num_channels)
+        visual_tokens_shape = visual_tokens.shape[:-1]
+        device = visual_tokens.device
+
+        # image token type embeddings.
+        token_type_ids = torch.zeros(visual_tokens_shape, dtype=torch.long, device=device)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = visual_tokens + token_type_embeddings
+        embeddings = self.layer_norm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class TvpTextInputEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+        if position_ids is None:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0).expand(input_shape)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        position_embeddings = self.position_embeddings(position_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + position_embeddings + token_type_embeddings
+        embeddings = self.layer_norm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class TvpAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention heads {config.num_attention_heads}"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+        self.attn_dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.num_attention_heads, self.attention_head_size)
+        heads = set(heads) - self.pruned_heads  # Convert to set and remove already pruned heads
+        for head in heads:
+            # Compute how many pruned heads are before the head and move the index accordingly
+            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+
+        # Prune linear layers
+        self.query = prune_linear_layer(self.query, index)
+        self.key = prune_linear_layer(self.key, index)
+        self.value = prune_linear_layer(self.value, index)
+        self.dense = prune_linear_layer(self.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.num_attention_heads = self.num_attention_heads - len(heads)
+        self.all_head_size = self.attention_head_size * self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _reshape(self, tensor: torch.Tensor, sequence_length: int, batch_size: int):
+        return (
+            tensor.view(batch_size, sequence_length, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+            .contiguous()
+        )
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions: Optional[bool] = None,
+    ):
+        batch_size, sequence_length = hidden_states.shape[:2]
+        mixed_query_layer = self.query(hidden_states)
+
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        query_layer = self._reshape(mixed_query_layer, sequence_length, batch_size)
+        key_layer = self._reshape(mixed_key_layer, sequence_length, batch_size)
+        value_layer = self._reshape(mixed_value_layer, sequence_length, batch_size)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.attn_dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        attn_output = torch.matmul(attention_probs, value_layer)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, sequence_length, self.all_head_size)
+
+        attn_output = self.dense(attn_output)
+        attn_output = self.dropout(attn_output)
+        attn_output = self.layer_norm(attn_output + hidden_states)
+        # add attentions if we output them
+        outputs = (attn_output, attention_probs) if output_attentions else (attn_output,)
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Tvp
+class TvpIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class TvpOutputLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.layer_norm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class TvpEncodeLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attention = TvpAttention(config)
+        self.intermediate = TvpIntermediate(config)
+        self.output = TvpOutputLayer(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions: Optional[bool] = None,
+    ):
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        outputs = (layer_output,) + outputs
+        return outputs
+
+
+class TvpEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([TvpEncodeLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        all_hidden_states = ()
+        all_attentions = ()
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer_module.__call__,
+                    hidden_states,
+                    attention_mask,
+                    (head_mask[i] if head_mask is not None else None),
+                    output_attentions,
+                )
+            else:
+                layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], output_attentions)
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            outputs = (hidden_states,)
+            if output_hidden_states:
+                outputs = outputs + (all_hidden_states,)
+            if output_attentions:
+                outputs = outputs + (all_attentions,)
+            return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states if output_hidden_states else None,
+            attentions=all_attentions if output_attentions else None,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Tvp
+class TvpPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class TvpPreTrainedModel(PreTrainedModel):
+    """An abstract class to handle weights initialization and
+    a simple interface for downloading and loading pretrained models.
+    """
+
+    config_class = TvpConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+        if isinstance(module, nn.Conv2d):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+
+
+TVP_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`TvpConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TVP_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
+            IDs?](../glossary#input-ids)
+
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`TvpImageProcessor`]. See [`TvpImageProcessor.__call__`]
+            for details.
+
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+            [What are attention masks?](../glossary#attention-mask)
+
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class TvpFrameDownPadPrompter(nn.Module):
+    """
+    Pad frames extracted from videos only at the bottom.
+    """
+
+    def __init__(self, config):
+        if config.visual_prompter_apply not in ("add", "replace", "remove"):
+            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")
+
+        super().__init__()
+        self.visual_prompt_size = config.visual_prompt_size
+        self.frame_num = config.frame_num
+        self.max_img_size = config.max_img_size
+        self.visual_prompter_apply = config.visual_prompter_apply
+
+        self.pad_down = nn.Parameter(
+            torch.randn([1, config.frame_num, 3, config.visual_prompt_size, config.max_img_size])
+        )
+
+    def forward(self, pixel_values):
+        if self.visual_prompter_apply != "add":
+            visual_prompt_mask = torch.ones(
+                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
+            )
+            visual_prompt_mask[self.max_img_size - self.visual_prompt_size : self.max_img_size, :] = 0.0
+            pixel_values *= visual_prompt_mask
+        if self.visual_prompter_apply != "remove":
+            prompt = torch.zeros(
+                [pixel_values.shape[0], pixel_values.shape[1], 3, self.max_img_size, self.max_img_size],
+                device=pixel_values.device,
+            )
+            start_point = self.max_img_size - self.visual_prompt_size
+            prompt[:, :, :, start_point : self.max_img_size, :] = self.pad_down
+            pixel_values += prompt.to(pixel_values.dtype)
+        return pixel_values
+
+
+class TvpFramePadPrompter(nn.Module):
+    """
+    Pad frames extracted from videos in the surroundings.
+    """
+
+    def __init__(self, config):
+        if config.visual_prompter_apply not in ("add", "replace", "remove"):
+            raise ValueError("`visual_prompter_apply` must be in (add, replace, remove)")
+
+        super().__init__()
+        self.num_frames = config.num_frames
+        self.max_img_size = config.max_img_size
+        self.visual_prompter_apply = config.visual_prompter_apply
+
+        self.base_size = config.max_img_size - config.visual_prompt_size * 2
+        self.pad_up = nn.Parameter(
+            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
+        )
+        self.pad_down = nn.Parameter(
+            torch.randn([1, config.num_frames, 3, config.visual_prompt_size, config.max_img_size])
+        )
+        self.pad_left = nn.Parameter(
+            torch.randn(
+                [
+                    1,
+                    config.num_frames,
+                    3,
+                    config.max_img_size - config.visual_prompt_size * 2,
+                    config.visual_prompt_size,
+                ]
+            )
+        )
+        self.pad_right = nn.Parameter(
+            torch.randn(
+                [
+                    1,
+                    config.num_frames,
+                    3,
+                    config.max_img_size - config.visual_prompt_size * 2,
+                    config.visual_prompt_size,
+                ]
+            )
+        )
+
+    def forward(self, pixel_values):
+        if self.visual_prompter_apply not in ("add", "remove", "replace"):
+            raise ValueError(f"Invalid visual_prompter_apply value {self.visual_prompter_apply}")
+        if self.visual_prompter_apply in ("replace", "remove"):
+            visual_prompt_mask = torch.ones(
+                [self.max_img_size, self.max_img_size], dtype=pixel_values.dtype, device=pixel_values.device
+            )
+            pixel_values *= visual_prompt_mask
+        if self.visual_prompter_apply in ("replace", "add"):
+            base = torch.zeros(1, self.num_frames, 3, self.base_size, self.base_size, device=pixel_values.device)
+            prompt = torch.cat([self.pad_left, base, self.pad_right], dim=4)
+            prompt = torch.cat([self.pad_up, prompt, self.pad_down], dim=3)
+            prompt = torch.cat(pixel_values.size(0) * [prompt])
+            pixel_values += prompt.to(pixel_values.dtype)
+        return pixel_values
+
+
+TVP_PROMPTER_CLASSES_MAPPING = {
+    "framedownpad": TvpFrameDownPadPrompter,
+    "framepad": TvpFramePadPrompter,
+}
+
+
+@add_start_docstrings(
+    "The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on" " top.",
+    TVP_START_DOCSTRING,
+)
+class TvpModel(TvpPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.vision_model = TvpVisionModel(config)
+        self.embeddings = TvpTextInputEmbeddings(config)
+        self.visual_embeddings = TvpVisualInputEmbedding(config)
+        self.encoder = TvpEncoder(config)
+        self.pooler = TvpPooler(config)
+        self.text_prompt = nn.Parameter(torch.randn([1, 10, config.hidden_size]))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        if config.visual_prompter_type not in TVP_PROMPTER_CLASSES_MAPPING:
+            raise ValueError("`visual_prompter_type` must be in (framedownpad, framepad)")
+        self.visual_prompter = TVP_PROMPTER_CLASSES_MAPPING[config.visual_prompter_type](config)
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=TvpConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        Returns:
+
+        Examples:
+        ```python
+        >>> import torch
+        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel
+
+        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")
+
+        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
+        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
+        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # Add visual prompt, it compensates for the spatiotemporal information loss in 2D visual features.
+        pixel_values = self.vision_model(self.visual_prompter(pixel_values))
+        # (batch_size, sequence_length, hidden_size)
+        text_embedding_output = self.embeddings(input_ids=input_ids)
+        # (batch_size, visual_sequence_length, hidden_size)
+        visual_embedding_output = self.visual_embeddings(pixel_values)
+        if attention_mask is not None:
+            # (batch_size, visual_sequence_length)
+            visual_attention_mask = attention_mask.new_ones(visual_embedding_output.shape[:2])
+            pt_mask = torch.ones(attention_mask.shape[0], 10).to(
+                device=attention_mask.device, dtype=attention_mask.dtype
+            )
+            attention_mask = torch.cat([pt_mask, attention_mask, visual_attention_mask], dim=-1)
+            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+            # ourselves in which case we just need to make it broadcastable to all heads.
+            attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.size()).to(input_ids.device)
+        text_prompt = self.text_prompt.expand(text_embedding_output.shape[0], -1, -1)
+        # (batch_size, sequence_length + visual_sequence_length, hidden_size)
+        embedding_output = torch.cat([text_prompt, text_embedding_output, visual_embedding_output], dim=1)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=attention_mask,
+            head_mask=self.get_head_mask(head_mask, self.config.num_hidden_layers),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs.last_hidden_state if return_dict else encoder_outputs[0]
+        pooled_output = self.pooler(last_hidden_state)
+        last_hidden_state = self.dropout(last_hidden_state)
+        pooled_output = self.dropout(pooled_output)
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class TvpVideoGroundingHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.layer_0 = nn.Linear(config.hidden_size, config.hidden_size * 2)
+        self.layer_1 = nn.Linear(config.hidden_size * 2, 2)
+        self.activation_0 = nn.ReLU()
+        self.activation_1 = nn.Sigmoid()
+
+    def forward(self, pooler_output):
+        logits = self.activation_0(self.layer_0(pooler_output))
+        logits = self.activation_1(self.layer_1(logits))
+        return logits
+
+
+@add_start_docstrings(
+    """
+    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
+    """,
+    TVP_START_DOCSTRING,
+)
+class TvpForVideoGrounding(TvpPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.model = TvpModel(config)
+        self.video_grounding_head = TvpVideoGroundingHead(config)
+
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(TVP_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TvpVideoGroundingOutput, config_class=TvpConfig)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        labels: Tuple[torch.Tensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
+            The labels contains duration, start time, and end time of the video corresponding to the text.
+        Returns:
+
+        Examples:
+        ```python
+        >>> import torch
+        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding
+
+        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")
+
+        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
+        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
+        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        outputs = self.model(
+            input_ids,
+            pixel_values,
+            attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooler_output = outputs[1]
+
+        logits = self.video_grounding_head(pooler_output)
+
+        loss = None
+        if labels is not None:
+            criterion = TvpLoss(["iou", "distance", "duration"])
+            criterion.to(self.device)
+            loss_dict = criterion(logits, labels)
+            loss = (
+                loss_dict["iou"]
+                + self.config.distance_loss_weight * loss_dict["distance"]
+                + self.config.duration_loss_weight * loss_dict["duration"]
+            )
+
+        if not return_dict:
+            outputs = (logits,) + outputs[2:]
+            if loss is not None:
+                outputs = (loss,) + outputs
+            return outputs
+
+        return TvpVideoGroundingOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
diff --git a/src/transformers/models/tvp/processing_tvp.py b/src/transformers/models/tvp/processing_tvp.py
new file mode 100644
index 00000000000000..4e27399ab8053f
--- /dev/null
+++ b/src/transformers/models/tvp/processing_tvp.py
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2023 The Intel AIA Team Authors, and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License=, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing=, software
+# distributed under the License is distributed on an "AS IS" BASIS=,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for TVP.
+"""
+
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding
+
+
+class TvpProcessor(ProcessorMixin):
+    r"""
+    Constructs an TVP processor which wraps a TVP image processor and a Bert tokenizer into a single processor.
+
+    [`TvpProcessor`] offers all the functionalities of [`TvpImageProcessor`] and [`BertTokenizerFast`]. See the
+    [`~TvpProcessor.__call__`] and [`~TvpProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`TvpImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`BertTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "TvpImageProcessor"
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if image_processor is None:
+            raise ValueError("You need to specify an `image_processor`.")
+        if tokenizer is None:
+            raise ValueError("You need to specify a `tokenizer`.")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(self, text=None, videos=None, return_tensors=None, **kwargs):
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `videos` and `kwargs` arguments to
+        TvpImageProcessor's [`~TvpImageProcessor.__call__`] if `videos` is not `None`. Please refer to the doctsring of
+        the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            videos (`List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, `List[List[PIL.Image.Image]]`, `List[List[np.ndarrray]]`,:
+                `List[List[torch.Tensor]]`): The video or batch of videos to be prepared. Each video should be a list
+                of frames, which can be either PIL images or NumPy arrays. In case of NumPy arrays/PyTorch tensors,
+                each frame should be of shape (H, W, C), where H and W are frame height and width, and C is a number of
+                channels.
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `videos` is not `None`.
+        """
+
+        max_text_length = kwargs.pop("max_text_length", None)
+
+        if text is None and videos is None:
+            raise ValueError("You have to specify either text or videos. Both cannot be none.")
+
+        encoding = {}
+        if text is not None:
+            textual_input = self.tokenizer.batch_encode_plus(
+                text,
+                truncation=True,
+                padding="max_length",
+                max_length=max_text_length,
+                pad_to_max_length=True,
+                return_tensors=return_tensors,
+                return_token_type_ids=False,
+                **kwargs,
+            )
+            encoding.update(textual_input)
+
+        if videos is not None:
+            image_features = self.image_processor(videos, return_tensors=return_tensors, **kwargs)
+            encoding.update(image_features)
+
+        return BatchEncoding(data=encoding, tensor_type=return_tensors)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    def post_process_video_grounding(self, logits, video_durations):
+        """
+        Compute the time of the video.
+
+        Args:
+            logits (`torch.Tensor`):
+                The logits output of TvpForVideoGrounding.
+            video_durations (`float`):
+                The video's duration.
+
+        Returns:
+            start (`float`):
+                The start time of the video.
+            end (`float`):
+                The end time of the video.
+        """
+        start, end = (
+            round(logits.tolist()[0][0] * video_durations, 1),
+            round(logits.tolist()[0][1] * video_durations, 1),
+        )
+
+        return start, end
+
+    @property
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
diff --git a/src/transformers/models/univnet/__init__.py b/src/transformers/models/univnet/__init__.py
new file mode 100644
index 00000000000000..afb03ee9894b0e
--- /dev/null
+++ b/src/transformers/models/univnet/__init__.py
@@ -0,0 +1,65 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import TYPE_CHECKING
+
+from ...utils import (
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    is_torch_available,
+)
+
+
+_import_structure = {
+    "configuration_univnet": [
+        "UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "UnivNetConfig",
+    ],
+    "feature_extraction_univnet": ["UnivNetFeatureExtractor"],
+}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_univnet"] = [
+        "UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "UnivNetModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_univnet import (
+        UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        UnivNetConfig,
+    )
+    from .feature_extraction_univnet import UnivNetFeatureExtractor
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_univnet import (
+            UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST,
+            UnivNetModel,
+        )
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
diff --git a/src/transformers/models/univnet/configuration_univnet.py b/src/transformers/models/univnet/configuration_univnet.py
new file mode 100644
index 00000000000000..c9dbbb5328210e
--- /dev/null
+++ b/src/transformers/models/univnet/configuration_univnet.py
@@ -0,0 +1,127 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" UnivNetModel model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+UNIVNET_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "dg845/univnet-dev": "https://huggingface.co/dg845/univnet-dev/resolve/main/config.json",
+}
+
+
+class UnivNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`UnivNetModel`]. It is used to instantiate a
+    UnivNet vocoder model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the UnivNet
+    [dg845/univnet-dev](https://huggingface.co/dg845/univnet-dev) architecture, which corresponds to the 'c32'
+    architecture in [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/master/config/default_c32.yaml).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        model_in_channels (`int`, *optional*, defaults to 64):
+            The number of input channels for the UnivNet residual network. This should correspond to
+            `noise_sequence.shape[1]` and the value used in the [`UnivNetFeatureExtractor`] class.
+        model_hidden_channels (`int`, *optional*, defaults to 32):
+            The number of hidden channels of each residual block in the UnivNet residual network.
+        num_mel_bins (`int`, *optional*, defaults to 100):
+            The number of frequency bins in the conditioning log-mel spectrogram. This should correspond to the value
+            used in the [`UnivNetFeatureExtractor`] class.
+        resblock_kernel_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[3, 3, 3]`):
+            A tuple of integers defining the kernel sizes of the 1D convolutional layers in the UnivNet residual
+            network. The length of `resblock_kernel_sizes` defines the number of resnet blocks and should match that of
+            `resblock_stride_sizes` and `resblock_dilation_sizes`.
+        resblock_stride_sizes (`Tuple[int]` or `List[int]`, *optional*, defaults to `[8, 8, 4]`):
+            A tuple of integers defining the stride sizes of the 1D convolutional layers in the UnivNet residual
+            network. The length of `resblock_stride_sizes` should match that of `resblock_kernel_sizes` and
+            `resblock_dilation_sizes`.
+        resblock_dilation_sizes (`Tuple[Tuple[int]]` or `List[List[int]]`, *optional*, defaults to `[[1, 3, 9, 27], [1, 3, 9, 27], [1, 3, 9, 27]]`):
+            A nested tuple of integers defining the dilation rates of the dilated 1D convolutional layers in the
+            UnivNet residual network. The length of `resblock_dilation_sizes` should match that of
+            `resblock_kernel_sizes` and `resblock_stride_sizes`. The length of each nested list in
+            `resblock_dilation_sizes` defines the number of convolutional layers per resnet block.
+        kernel_predictor_num_blocks (`int`, *optional*, defaults to 3):
+            The number of residual blocks in the kernel predictor network, which calculates the kernel and bias for
+            each location variable convolution layer in the UnivNet residual network.
+        kernel_predictor_hidden_channels (`int`, *optional*, defaults to 64):
+            The number of hidden channels for each residual block in the kernel predictor network.
+        kernel_predictor_conv_size (`int`, *optional*, defaults to 3):
+            The kernel size of each 1D convolutional layer in the kernel predictor network.
+        kernel_predictor_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for each residual block in the kernel predictor network.
+        initializer_range (`float`, *optional*, defaults to 0.01):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        leaky_relu_slope (`float`, *optional*, defaults to 0.2):
+            The angle of the negative slope used by the leaky ReLU activation.
+
+    Example:
+
+    ```python
+    >>> from transformers import UnivNetModel, UnivNetConfig
+
+    >>> # Initializing a Tortoise TTS style configuration
+    >>> configuration = UnivNetConfig()
+
+    >>> # Initializing a model (with random weights) from the Tortoise TTS style configuration
+    >>> model = UnivNetModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "univnet"
+
+    def __init__(
+        self,
+        model_in_channels=64,
+        model_hidden_channels=32,
+        num_mel_bins=100,
+        resblock_kernel_sizes=[3, 3, 3],
+        resblock_stride_sizes=[8, 8, 4],
+        resblock_dilation_sizes=[[1, 3, 9, 27], [1, 3, 9, 27], [1, 3, 9, 27]],
+        kernel_predictor_num_blocks=3,
+        kernel_predictor_hidden_channels=64,
+        kernel_predictor_conv_size=3,
+        kernel_predictor_dropout=0.0,
+        initializer_range=0.01,
+        leaky_relu_slope=0.2,
+        **kwargs,
+    ):
+        if not (len(resblock_kernel_sizes) == len(resblock_stride_sizes) == len(resblock_dilation_sizes)):
+            raise ValueError(
+                "`resblock_kernel_sizes`, `resblock_stride_sizes`, and `resblock_dilation_sizes` must all have the"
+                " same length (which will be the number of resnet blocks in the model)."
+            )
+
+        self.model_in_channels = model_in_channels
+        self.model_hidden_channels = model_hidden_channels
+        self.num_mel_bins = num_mel_bins
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_stride_sizes = resblock_stride_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.kernel_predictor_num_blocks = kernel_predictor_num_blocks
+        self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels
+        self.kernel_predictor_conv_size = kernel_predictor_conv_size
+        self.kernel_predictor_dropout = kernel_predictor_dropout
+        self.initializer_range = initializer_range
+        self.leaky_relu_slope = leaky_relu_slope
+        super().__init__(**kwargs)
diff --git a/src/transformers/models/univnet/convert_univnet.py b/src/transformers/models/univnet/convert_univnet.py
new file mode 100644
index 00000000000000..30520b7fa14725
--- /dev/null
+++ b/src/transformers/models/univnet/convert_univnet.py
@@ -0,0 +1,162 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import torch
+
+from transformers import UnivNetConfig, UnivNetModel, logging
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers.models.univnet")
+
+
+def get_kernel_predictor_key_mapping(config: UnivNetConfig, old_prefix: str = "", new_prefix: str = ""):
+    mapping = {}
+    # Initial conv layer
+    mapping[f"{old_prefix}.input_conv.0.weight_g"] = f"{new_prefix}.input_conv.weight_g"
+    mapping[f"{old_prefix}.input_conv.0.weight_v"] = f"{new_prefix}.input_conv.weight_v"
+    mapping[f"{old_prefix}.input_conv.0.bias"] = f"{new_prefix}.input_conv.bias"
+
+    # Kernel predictor resnet blocks
+    for i in range(config.kernel_predictor_num_blocks):
+        mapping[f"{old_prefix}.residual_convs.{i}.1.weight_g"] = f"{new_prefix}.resblocks.{i}.conv1.weight_g"
+        mapping[f"{old_prefix}.residual_convs.{i}.1.weight_v"] = f"{new_prefix}.resblocks.{i}.conv1.weight_v"
+        mapping[f"{old_prefix}.residual_convs.{i}.1.bias"] = f"{new_prefix}.resblocks.{i}.conv1.bias"
+
+        mapping[f"{old_prefix}.residual_convs.{i}.3.weight_g"] = f"{new_prefix}.resblocks.{i}.conv2.weight_g"
+        mapping[f"{old_prefix}.residual_convs.{i}.3.weight_v"] = f"{new_prefix}.resblocks.{i}.conv2.weight_v"
+        mapping[f"{old_prefix}.residual_convs.{i}.3.bias"] = f"{new_prefix}.resblocks.{i}.conv2.bias"
+
+    # Kernel output conv
+    mapping[f"{old_prefix}.kernel_conv.weight_g"] = f"{new_prefix}.kernel_conv.weight_g"
+    mapping[f"{old_prefix}.kernel_conv.weight_v"] = f"{new_prefix}.kernel_conv.weight_v"
+    mapping[f"{old_prefix}.kernel_conv.bias"] = f"{new_prefix}.kernel_conv.bias"
+
+    # Bias output conv
+    mapping[f"{old_prefix}.bias_conv.weight_g"] = f"{new_prefix}.bias_conv.weight_g"
+    mapping[f"{old_prefix}.bias_conv.weight_v"] = f"{new_prefix}.bias_conv.weight_v"
+    mapping[f"{old_prefix}.bias_conv.bias"] = f"{new_prefix}.bias_conv.bias"
+
+    return mapping
+
+
+def get_key_mapping(config: UnivNetConfig):
+    mapping = {}
+
+    # NOTE: inital conv layer keys are the same
+
+    # LVC Residual blocks
+    for i in range(len(config.resblock_stride_sizes)):
+        # LVCBlock initial convt layer
+        mapping[f"res_stack.{i}.convt_pre.1.weight_g"] = f"resblocks.{i}.convt_pre.weight_g"
+        mapping[f"res_stack.{i}.convt_pre.1.weight_v"] = f"resblocks.{i}.convt_pre.weight_v"
+        mapping[f"res_stack.{i}.convt_pre.1.bias"] = f"resblocks.{i}.convt_pre.bias"
+
+        # Kernel predictor
+        kernel_predictor_mapping = get_kernel_predictor_key_mapping(
+            config, old_prefix=f"res_stack.{i}.kernel_predictor", new_prefix=f"resblocks.{i}.kernel_predictor"
+        )
+        mapping.update(kernel_predictor_mapping)
+
+        # LVC Residual blocks
+        for j in range(len(config.resblock_dilation_sizes[i])):
+            mapping[f"res_stack.{i}.conv_blocks.{j}.1.weight_g"] = f"resblocks.{i}.resblocks.{j}.conv.weight_g"
+            mapping[f"res_stack.{i}.conv_blocks.{j}.1.weight_v"] = f"resblocks.{i}.resblocks.{j}.conv.weight_v"
+            mapping[f"res_stack.{i}.conv_blocks.{j}.1.bias"] = f"resblocks.{i}.resblocks.{j}.conv.bias"
+
+    # Output conv layer
+    mapping["conv_post.1.weight_g"] = "conv_post.weight_g"
+    mapping["conv_post.1.weight_v"] = "conv_post.weight_v"
+    mapping["conv_post.1.bias"] = "conv_post.bias"
+
+    return mapping
+
+
+def rename_state_dict(state_dict, keys_to_modify, keys_to_remove):
+    model_state_dict = {}
+    for key, value in state_dict.items():
+        if key in keys_to_remove:
+            continue
+
+        if key in keys_to_modify:
+            new_key = keys_to_modify[key]
+            model_state_dict[new_key] = value
+        else:
+            model_state_dict[key] = value
+    return model_state_dict
+
+
+def convert_univnet_checkpoint(
+    checkpoint_path,
+    pytorch_dump_folder_path,
+    config_path=None,
+    repo_id=None,
+    safe_serialization=False,
+):
+    model_state_dict_base = torch.load(checkpoint_path, map_location="cpu")
+    # Get the generator's state dict
+    state_dict = model_state_dict_base["model_g"]
+
+    if config_path is not None:
+        config = UnivNetConfig.from_pretrained(config_path)
+    else:
+        config = UnivNetConfig()
+
+    keys_to_modify = get_key_mapping(config)
+    keys_to_remove = set()
+    hf_state_dict = rename_state_dict(state_dict, keys_to_modify, keys_to_remove)
+
+    model = UnivNetModel(config)
+    # Apply weight norm since the original checkpoint has weight norm applied
+    model.apply_weight_norm()
+    model.load_state_dict(hf_state_dict)
+    # Remove weight norm in preparation for inference
+    model.remove_weight_norm()
+
+    model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)
+
+    if repo_id:
+        print("Pushing to the hub...")
+        model.push_to_hub(repo_id)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--checkpoint_path", required=True, default=None, type=str, help="Path to original checkpoint")
+    parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert")
+    parser.add_argument(
+        "--pytorch_dump_folder_path", required=True, default=None, type=str, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--push_to_hub", default=None, type=str, help="Where to upload the converted model on the 🤗 hub."
+    )
+    parser.add_argument(
+        "--safe_serialization", action="store_true", help="Whether to save the model using `safetensors`."
+    )
+
+    args = parser.parse_args()
+
+    convert_univnet_checkpoint(
+        args.checkpoint_path,
+        args.pytorch_dump_folder_path,
+        args.config_path,
+        args.push_to_hub,
+        args.safe_serialization,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/univnet/feature_extraction_univnet.py b/src/transformers/models/univnet/feature_extraction_univnet.py
new file mode 100644
index 00000000000000..067aacc3d8c8ca
--- /dev/null
+++ b/src/transformers/models/univnet/feature_extraction_univnet.py
@@ -0,0 +1,456 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for UnivNetModel."""
+
+from typing import Any, Dict, List, Optional, Union
+
+import numpy as np
+
+from ...audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class UnivNetFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a UnivNet feature extractor.
+
+    This class extracts log-mel-filter bank features from raw speech using the short time Fourier Transform (STFT). The
+    STFT implementation follows that of TacoTron 2 and Hifi-GAN.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features.
+        sampling_rate (`int`, *optional*, defaults to 24000):
+            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value to pad with when applying the padding strategy defined by the `padding` argument to
+            [`UnivNetFeatureExtractor.__call__`]. Should correspond to audio silence. The `pad_end` argument to
+            `__call__` will also use this padding value.
+        do_normalize (`bool`, *optional*, defaults to `False`):
+            Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve the
+            performance for some models.
+        num_mel_bins (`int`, *optional*, defaults to 100):
+            The number of mel-frequency bins in the extracted spectrogram features. This should match
+            `UnivNetModel.config.num_mel_bins`.
+        hop_length (`int`, *optional*, defaults to 256):
+            The direct number of samples between sliding windows. Otherwise referred to as "shift" in many papers. Note
+            that this is different from other audio feature extractors such as [`SpeechT5FeatureExtractor`] which take
+            the `hop_length` in ms.
+        win_length (`int`, *optional*, defaults to 1024):
+            The direct number of samples for each sliding window. Note that this is different from other audio feature
+            extractors such as [`SpeechT5FeatureExtractor`] which take the `win_length` in ms.
+        win_function (`str`, *optional*, defaults to `"hann_window"`):
+            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
+        filter_length (`int`, *optional*, defaults to 1024):
+            The number of FFT components to use. If `None`, this is determined using
+            `transformers.audio_utils.optimal_fft_length`.
+        max_length_s (`int`, *optional*, defaults to 10):
+            The maximum input lenght of the model in seconds. This is used to pad the audio.
+        fmin (`float`, *optional*, defaults to 0.0):
+            Minimum mel frequency in Hz.
+        fmax (`float`, *optional*):
+            Maximum mel frequency in Hz. If not set, defaults to `sampling_rate / 2`.
+        mel_floor (`float`, *optional*, defaults to 1e-09):
+            Minimum value of mel frequency banks. Note that the way [`UnivNetFeatureExtractor`] uses `mel_floor` is
+            different than in [`transformers.audio_utils.spectrogram`].
+        center (`bool`, *optional*, defaults to `False`):
+            Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame
+            `t` will start at time `t * hop_length`.
+        compression_factor (`float`, *optional*, defaults to 1.0):
+            The multiplicative compression factor for dynamic range compression during spectral normalization.
+        compression_clip_val (`float`, *optional*, defaults to 1e-05):
+            The clip value applied to the waveform before applying dynamic range compression during spectral
+            normalization.
+        normalize_min (`float`, *optional*, defaults to -11.512925148010254):
+            The min value used for Tacotron 2-style linear normalization. The default is the original value from the
+            Tacotron 2 implementation.
+        normalize_max (`float`, *optional*, defaults to 2.3143386840820312):
+            The max value used for Tacotron 2-style linear normalization. The default is the original value from the
+            Tacotron 2 implementation.
+        model_in_channels (`int`, *optional*, defaults to 64):
+            The number of input channels to the [`UnivNetModel`] model. This should match
+            `UnivNetModel.config.model_in_channels`.
+        pad_end_length (`int`, *optional*, defaults to 10):
+            If padding the end of each waveform, the number of spectrogram frames worth of samples to append. The
+            number of appended samples will be `pad_end_length * hop_length`.
+        return_attention_mask (`bool`, *optional*, defaults to `True`):
+            Whether or not [`~UnivNetFeatureExtractor.__call__`] should return `attention_mask`.
+    """
+
+    model_input_names = ["input_features", "noise_sequence", "padding_mask"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 24000,
+        padding_value: float = 0.0,
+        do_normalize: bool = False,
+        num_mel_bins: int = 100,
+        hop_length: int = 256,
+        win_length: int = 1024,
+        win_function: str = "hann_window",
+        filter_length: Optional[int] = 1024,
+        max_length_s: int = 10,
+        fmin: float = 0.0,
+        fmax: Optional[float] = None,
+        mel_floor: float = 1e-9,
+        center: bool = False,
+        compression_factor: float = 1.0,
+        compression_clip_val: float = 1e-5,
+        normalize_min: float = -11.512925148010254,
+        normalize_max: float = 2.3143386840820312,
+        model_in_channels: int = 64,
+        pad_end_length: int = 10,
+        return_attention_mask=True,
+        **kwargs,
+    ):
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            return_attention_mask=return_attention_mask,
+            **kwargs,
+        )
+
+        self.do_normalize = do_normalize
+
+        self.num_mel_bins = num_mel_bins
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.win_function = win_function
+        self.filter_length = filter_length
+        self.fmin = fmin
+        if fmax is None:
+            # Follows the librosa.filters.mel implementation
+            fmax = float(sampling_rate) / 2
+        self.fmax = fmax
+        self.mel_floor = mel_floor
+
+        self.max_length_s = max_length_s
+        self.num_max_samples = max_length_s * sampling_rate
+
+        if self.filter_length is None:
+            self.n_fft = optimal_fft_length(self.win_length)
+        else:
+            self.n_fft = self.filter_length
+        self.n_freqs = (self.n_fft // 2) + 1
+
+        self.window = window_function(window_length=self.win_length, name=self.win_function, periodic=True)
+
+        self.mel_filters = mel_filter_bank(
+            num_frequency_bins=self.n_freqs,
+            num_mel_filters=self.num_mel_bins,
+            min_frequency=self.fmin,
+            max_frequency=self.fmax,
+            sampling_rate=self.sampling_rate,
+            norm="slaney",
+            mel_scale="slaney",
+        )
+
+        self.center = center
+        self.compression_factor = compression_factor
+        self.compression_clip_val = compression_clip_val
+        self.normalize_min = normalize_min
+        self.normalize_max = normalize_max
+        self.model_in_channels = model_in_channels
+        self.pad_end_length = pad_end_length
+
+    def normalize(self, spectrogram):
+        return 2 * ((spectrogram - self.normalize_min) / (self.normalize_max - self.normalize_min)) - 1
+
+    def denormalize(self, spectrogram):
+        return self.normalize_min + (self.normalize_max - self.normalize_min) * ((spectrogram + 1) / 2)
+
+    def mel_spectrogram(self, waveform: np.ndarray) -> np.ndarray:
+        """
+        Calculates log MEL spectrograms from a batch of waveforms. Note that the input waveform(s) will be padded by
+        `int(self.n_fft - self.hop_length) / 2` on both sides using the `reflect` padding mode.
+
+        Args:
+            waveform (`np.ndarray` of shape `(length,)`):
+                The input waveform. This must be a single real-valued, mono waveform.
+
+        Returns:
+            `numpy.ndarray`: Array containing a log-mel spectrogram of shape `(num_frames, num_mel_bins)`.
+        """
+        # Do custom padding based on the official MelGAN and Hifi-GAN implementations
+        # See https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/utils/stft.py#L84-L86
+        waveform = np.pad(
+            waveform,
+            (int((self.n_fft - self.hop_length) / 2), int((self.n_fft - self.hop_length) / 2)),
+            mode="reflect",
+        )
+
+        # Get the complex spectrogram.
+        # Note: waveform must be unbatched currently due to the implementation of spectrogram(...).
+        complex_spectrogram = spectrogram(
+            waveform,
+            window=self.window,
+            frame_length=self.n_fft,
+            hop_length=self.hop_length,
+            fft_length=self.n_fft,
+            power=None,
+            center=self.center,
+            mel_filters=None,
+            mel_floor=None,
+        )
+
+        # Apply the MEL filter bank and MEL floor manually since UnivNet uses a slightly different implementation
+        amplitude_spectrogram = np.sqrt(
+            np.real(complex_spectrogram) ** 2 + np.imag(complex_spectrogram) ** 2 + self.mel_floor
+        )
+        mel_spectrogram = np.matmul(self.mel_filters.T, amplitude_spectrogram)
+
+        # Perform spectral normalization to get the log mel spectrogram.
+        log_mel_spectrogram = np.log(
+            np.clip(mel_spectrogram, a_min=self.compression_clip_val, a_max=None) * self.compression_factor
+        )
+
+        # Return spectrogram with num_mel_bins last
+        return log_mel_spectrogram.T
+
+    def generate_noise(
+        self,
+        noise_length: int,
+        generator: Optional[np.random.Generator] = None,
+    ) -> np.ndarray:
+        """
+        Generates a random noise sequence of standard Gaussian noise for use in the `noise_sequence` argument of
+        [`UnivNetModel.forward`].
+
+        Args:
+            spectrogram_length (`int`):
+                The length (dim 0) of the generated noise.
+            model_in_channels (`int`, *optional*, defaults to `None`):
+                The number of features (dim 1) of the generated noise. This should correspond to the
+                `model_in_channels` of the [`UnivNetGan`] model. If not set, this will default to
+                `self.config.model_in_channels`.
+            generator (`numpy.random.Generator`, *optional*, defaults to `None`)
+                An optional `numpy.random.Generator` random number generator to control noise generation. If not set, a
+                new generator with fresh entropy will be created.
+
+        Returns:
+            `numpy.ndarray`: Array containing random standard Gaussian noise of shape `(noise_length,
+            model_in_channels)`.
+        """
+        if generator is None:
+            generator = np.random.default_rng()
+
+        noise_shape = (noise_length, self.model_in_channels)
+        noise = generator.standard_normal(noise_shape, dtype=np.float32)
+
+        return noise
+
+    def batch_decode(self, waveforms, waveform_lengths=None) -> List[np.ndarray]:
+        r"""
+        Removes padding from generated audio after running [`UnivNetModel.forward`]. This returns a ragged list of 1D
+        audio waveform arrays and not a single tensor/array because in general the waveforms will have different
+        lengths after removing padding.
+
+        Args:
+            waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+                The batched output waveforms from the [`UnivNetModel`].
+            waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`, *optional*):
+                The batched lengths of each waveform before padding.
+
+        Returns:
+            `List[np.ndarray]`: A ragged list of 1D waveform arrays with padding removed.
+        """
+        # Collapse the batched waveform tensor to a list of 1D audio waveforms
+        waveforms = [waveform.detach().clone().cpu().numpy() for waveform in waveforms]
+
+        if waveform_lengths is not None:
+            waveforms = [waveform[: waveform_lengths[i]] for i, waveform in enumerate(waveforms)]
+
+        return waveforms
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]],
+        sampling_rate: Optional[int] = None,
+        padding: Union[bool, str, PaddingStrategy] = True,
+        max_length: Optional[int] = None,
+        truncation: bool = True,
+        pad_to_multiple_of: Optional[int] = None,
+        return_noise: bool = True,
+        generator: Optional[np.random.Generator] = None,
+        pad_end: bool = False,
+        pad_length: Optional[int] = None,
+        do_normalize: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
+                pipeline.
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the input `raw_speech` waveforms (according to the model's padding side and
+                padding index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+
+                If `pad_end = True`, that padding will occur before the `padding` strategy is applied.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`, *optional*, defaults to `True`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+            return_noise (`bool`, *optional*, defaults to `True`):
+                Whether to generate and return a noise waveform for use in [`UnivNetModel.forward`].
+            generator (`numpy.random.Generator`, *optional*, defaults to `None`):
+                An optional `numpy.random.Generator` random number generator to use when generating noise.
+            pad_end (`bool`, *optional*, defaults to `False`):
+                Whether to pad the end of each waveform with silence. This can help reduce artifacts at the end of the
+                generated audio sample; see https://github.com/seungwonpark/melgan/issues/8 for more details. This
+                padding will be done before the padding strategy specified in `padding` is performed.
+            pad_length (`int`, *optional*, defaults to `None`):
+                If padding the end of each waveform, the length of the padding in spectrogram frames. If not set, this
+                will default to `self.config.pad_end_length`.
+            do_normalize (`bool`, *optional*):
+                Whether to perform Tacotron 2 normalization on the input. Normalizing can help to significantly improve
+                the performance for some models. If not set, this will default to `self.config.do_normalize`.
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific feature_extractor's default.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.np.array` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+        """
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self.__class__.__name__} was trained using a"
+                    f" sampling rate of {self.sampling_rate}. Please make sure that the provided `raw_speech` input"
+                    f" was sampled with {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        is_batched = is_batched_numpy or (
+            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_speech = [np.asarray(raw_speech, dtype=np.float32)]
+
+        # Pad end to reduce artifacts
+        if pad_end:
+            pad_length = pad_length if pad_length is not None else self.pad_end_length
+            raw_speech = [
+                np.pad(waveform, (0, pad_length * self.hop_length), constant_values=self.padding_value)
+                for waveform in raw_speech
+            ]
+
+        batched_speech = BatchFeature({"input_features": raw_speech})
+
+        padded_inputs = self.pad(
+            batched_speech,
+            padding=padding,
+            max_length=max_length if max_length is not None else self.num_max_samples,
+            truncation=truncation,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+        )
+
+        # make sure list is in array format
+        # input_features = padded_inputs.get("input_features").transpose(2, 0, 1)
+        input_features = padded_inputs.get("input_features")
+
+        mel_spectrograms = [self.mel_spectrogram(waveform) for waveform in input_features]
+
+        if isinstance(input_features[0], List):
+            batched_speech["input_features"] = [np.asarray(mel, dtype=np.float32) for mel in mel_spectrograms]
+        else:
+            batched_speech["input_features"] = [mel.astype(np.float32) for mel in mel_spectrograms]
+
+        # convert attention_mask to correct format
+        attention_mask = padded_inputs.get("attention_mask")
+        if attention_mask is not None:
+            batched_speech["padding_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
+
+        if return_noise:
+            noise = [
+                self.generate_noise(spectrogram.shape[0], generator)
+                for spectrogram in batched_speech["input_features"]
+            ]
+            batched_speech["noise_sequence"] = noise
+
+        if do_normalize:
+            batched_speech["input_features"] = [
+                self.normalize(spectrogram) for spectrogram in batched_speech["input_features"]
+            ]
+
+        if return_tensors is not None:
+            batched_speech = batched_speech.convert_to_tensors(return_tensors)
+
+        return batched_speech
+
+    def to_dict(self) -> Dict[str, Any]:
+        output = super().to_dict()
+
+        # Don't serialize these as they are derived from the other properties.
+        names = ["window", "mel_filters", "n_fft", "n_freqs", "num_max_samples"]
+        for name in names:
+            if name in output:
+                del output[name]
+
+        return output
diff --git a/src/transformers/models/univnet/modeling_univnet.py b/src/transformers/models/univnet/modeling_univnet.py
new file mode 100644
index 00000000000000..dc9beddec525b8
--- /dev/null
+++ b/src/transformers/models/univnet/modeling_univnet.py
@@ -0,0 +1,636 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch UnivNetModel model."""
+
+from dataclasses import dataclass
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ...modeling_utils import ModelOutput, PreTrainedModel
+from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from .configuration_univnet import UnivNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "UnivNetConfig"
+
+_CHECKPOINT_FOR_DOC = "dg845/univnet-dev"
+
+UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "dg845/univnet-dev",
+    # See all UnivNet models at https://huggingface.co/models?filter=univnet
+]
+
+
+@dataclass
+class UnivNetModelOutput(ModelOutput):
+    """
+    Output class for the [`UnivNetModel`], which includes the generated audio waveforms and the original unpadded
+    lengths of those waveforms (so that the padding can be removed by [`UnivNetModel.batch_decode`]).
+
+    Args:
+        waveforms (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Batched 1D (mono-channel) output audio waveforms.
+        waveform_lengths (`torch.FloatTensor` of shape `(batch_size,)`):
+            The batched length in samples of each unpadded waveform in `waveforms`.
+    """
+
+    waveforms: torch.FloatTensor = None
+    waveform_lengths: torch.FloatTensor = None
+
+
+class UnivNetKernelPredictorResidualBlock(nn.Module):
+    """
+    Implementation of the residual block for the kernel predictor network inside each location variable convolution
+    block (LVCBlock).
+
+    Parameters:
+        config: (`UnivNetConfig`):
+            Config for the `UnivNetModel` model.
+    """
+
+    def __init__(
+        self,
+        config: UnivNetConfig,
+    ):
+        super().__init__()
+        self.channels = config.model_in_channels
+        self.kernel_size = config.kernel_predictor_conv_size
+        self.dropout_prob = config.kernel_predictor_dropout
+        self.leaky_relu_slope = config.leaky_relu_slope
+
+        padding = (self.kernel_size - 1) // 2
+
+        self.dropout = nn.Dropout(self.dropout_prob)
+        self.conv1 = nn.Conv1d(self.channels, self.channels, self.kernel_size, padding=padding, bias=True)
+        self.conv2 = nn.Conv1d(self.channels, self.channels, self.kernel_size, padding=padding, bias=True)
+
+    def forward(self, hidden_states: torch.FloatTensor):
+        # hidden_states should have shape (batch_size, channels, seq_length)
+        residual = hidden_states
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+        return hidden_states + residual
+
+    def apply_weight_norm(self):
+        nn.utils.weight_norm(self.conv1)
+        nn.utils.weight_norm(self.conv2)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.conv1)
+        nn.utils.remove_weight_norm(self.conv2)
+
+
+class UnivNetKernelPredictor(nn.Module):
+    """
+    Implementation of the kernel predictor network which supplies the kernel and bias for the location variable
+    convolutional layers (LVCs) in each UnivNet LVCBlock.
+
+    Based on the KernelPredictor implementation in
+    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L7).
+
+    Parameters:
+        config: (`UnivNetConfig`):
+            Config for the `UnivNetModel` model.
+        conv_kernel_size (`int`, *optional*, defaults to 3):
+            The kernel size for the location variable convolutional layer kernels (convolutional weight tensor).
+        conv_layers (`int`, *optional*, defaults to 4):
+            The number of location variable convolutional layers to output kernels and biases for.
+    """
+
+    def __init__(
+        self,
+        config: UnivNetConfig,
+        conv_kernel_size: int = 3,
+        conv_layers: int = 4,
+    ):
+        super().__init__()
+
+        self.conv_in_channels = config.model_hidden_channels
+        self.conv_out_channels = 2 * config.model_hidden_channels
+        self.conv_kernel_size = conv_kernel_size
+        self.conv_layers = conv_layers
+
+        self.kernel_channels = (
+            self.conv_in_channels * self.conv_out_channels * self.conv_kernel_size * self.conv_layers
+        )
+        self.bias_channels = self.conv_out_channels * self.conv_layers
+
+        self.resnet_in_channels = config.num_mel_bins
+        self.resnet_hidden_channels = config.kernel_predictor_hidden_channels
+        self.resnet_kernel_size = config.kernel_predictor_conv_size
+        self.num_blocks = config.kernel_predictor_num_blocks
+
+        self.leaky_relu_slope = config.leaky_relu_slope
+
+        padding = (self.resnet_kernel_size - 1) // 2
+
+        self.input_conv = nn.Conv1d(self.resnet_in_channels, self.resnet_hidden_channels, 5, padding=2, bias=True)
+
+        self.resblocks = nn.ModuleList([UnivNetKernelPredictorResidualBlock(config) for _ in range(self.num_blocks)])
+
+        self.kernel_conv = nn.Conv1d(
+            self.resnet_hidden_channels, self.kernel_channels, self.resnet_kernel_size, padding=padding, bias=True
+        )
+        self.bias_conv = nn.Conv1d(
+            self.resnet_hidden_channels, self.bias_channels, self.resnet_kernel_size, padding=padding, bias=True
+        )
+
+    def forward(self, spectrogram: torch.FloatTensor):
+        """
+        Maps a conditioning log-mel spectrogram to a tensor of convolutional kernels and biases, for use in location
+        variable convolutional layers. Note that the input spectrogram should have shape (batch_size, input_channels,
+        seq_length).
+
+        Args:
+            spectrogram (`torch.FloatTensor` of shape `(batch_size, input_channels, seq_length)`):
+                Tensor containing the log-mel spectrograms.
+
+        Returns:
+            Tuple[`torch.FloatTensor, `torch.FloatTensor`]: tuple of tensors where the first element is the tensor of
+            location variable convolution kernels of shape `(batch_size, self.conv_layers, self.conv_in_channels,
+            self.conv_out_channels, self.conv_kernel_size, seq_length)` and the second element is the tensor of
+            location variable convolution biases of shape `(batch_size, self.conv_layers. self.conv_out_channels,
+            seq_length)`.
+        """
+        batch_size, _, seq_length = spectrogram.shape
+
+        hidden_states = self.input_conv(spectrogram)
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+
+        for resblock in self.resblocks:
+            hidden_states = resblock(hidden_states)
+
+        kernel_hidden_states = self.kernel_conv(hidden_states)
+        bias_hidden_states = self.bias_conv(hidden_states)
+
+        # Reshape kernels and biases to appropriate shape
+        kernels = kernel_hidden_states.view(
+            batch_size,
+            self.conv_layers,
+            self.conv_in_channels,
+            self.conv_out_channels,
+            self.conv_kernel_size,
+            seq_length,
+        ).contiguous()
+        biases = bias_hidden_states.view(
+            batch_size,
+            self.conv_layers,
+            self.conv_out_channels,
+            seq_length,
+        ).contiguous()
+
+        return kernels, biases
+
+    def apply_weight_norm(self):
+        nn.utils.weight_norm(self.input_conv)
+        for layer in self.resblocks:
+            layer.apply_weight_norm()
+        nn.utils.weight_norm(self.kernel_conv)
+        nn.utils.weight_norm(self.bias_conv)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.input_conv)
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.kernel_conv)
+        nn.utils.remove_weight_norm(self.bias_conv)
+
+
+class UnivNetLvcResidualBlock(nn.Module):
+    """
+    Implementation of the location variable convolution (LVC) residual block for the UnivNet residual network.
+
+    Parameters:
+        config: (`UnivNetConfig`):
+            Config for the `UnivNetModel` model.
+        kernel_size (`int`):
+            The kernel size for the dilated 1D convolutional layer.
+        dilation (`int`):
+            The dilation for the dilated 1D convolutional layer.
+    """
+
+    def __init__(
+        self,
+        config: UnivNetConfig,
+        kernel_size: int,
+        dilation: int,
+    ):
+        super().__init__()
+        self.hidden_channels = config.model_hidden_channels
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+        self.leaky_relu_slope = config.leaky_relu_slope
+
+        padding = self.dilation * (self.kernel_size - 1) // 2
+
+        self.conv = nn.Conv1d(
+            self.hidden_channels,
+            self.hidden_channels,
+            self.kernel_size,
+            padding=padding,
+            dilation=self.dilation,
+        )
+
+    def forward(self, hidden_states, kernel, bias, hop_size=256):
+        residual = hidden_states
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+        hidden_states = self.conv(hidden_states)
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+        hidden_states = self.location_variable_convolution(hidden_states, kernel, bias, hop_size=hop_size)
+        # Gated activation unit
+        hidden_states = torch.sigmoid(hidden_states[:, : self.hidden_channels, :]) * torch.tanh(
+            hidden_states[:, self.hidden_channels :, :]
+        )
+        # Skip connection
+        hidden_states = residual + hidden_states
+
+        return hidden_states
+
+    # Based on https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L171
+    def location_variable_convolution(
+        self,
+        hidden_states: torch.FloatTensor,
+        kernel: torch.FloatTensor,
+        bias: torch.FloatTensor,
+        dilation: int = 1,
+        hop_size: int = 256,
+    ):
+        """
+        Performs location-variable convolution operation on the input sequence (hidden_states) using the local
+        convolution kernel. This was introduced in [LVCNet: Efficient Condition-Dependent Modeling Network for Waveform
+        Generation](https://arxiv.org/abs/2102.10815) by Zhen Zheng, Jianzong Wang, Ning Cheng, and Jing Xiao.
+
+        Time: 414 μs ± 309 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each), test on NVIDIA V100.
+
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, in_channels, in_length)`):
+                The input sequence of shape (batch, in_channels, in_length).
+            kernel (`torch.FloatTensor` of shape `(batch_size, in_channels, out_channels, kernel_size, kernel_length)`):
+                The local convolution kernel of shape (batch, in_channels, out_channels, kernel_size, kernel_length).
+            bias (`torch.FloatTensor` of shape `(batch_size, out_channels, kernel_length)`):
+                The bias for the local convolution of shape (batch, out_channels, kernel_length).
+            dilation (`int`, *optional*, defaults to 1):
+                The dilation of convolution.
+            hop_size (`int`, *optional*, defaults to 256):
+                The hop_size of the conditioning sequence.
+        Returns:
+            `torch.FloatTensor`: the output sequence after performing local convolution with shape (batch_size,
+            out_channels, in_length).
+        """
+        batch, _, in_length = hidden_states.shape
+        batch, _, out_channels, kernel_size, kernel_length = kernel.shape
+        if in_length != (kernel_length * hop_size):
+            raise ValueError(
+                f"Dim 2 of `hidden_states` should be {kernel_length * hop_size}) but got {in_length}. Please check"
+                " `hidden_states` or `kernel` and `hop_size` to make sure they are correct."
+            )
+
+        padding = dilation * int((kernel_size - 1) / 2)
+
+        # (batch, in_channels, in_length + 2*padding)
+        hidden_states = nn.functional.pad(hidden_states, (padding, padding), "constant", 0)
+        # (batch, in_channels, kernel_length, hop_size + 2*padding)
+        hidden_states = hidden_states.unfold(2, hop_size + 2 * padding, hop_size)
+
+        if hop_size < dilation:
+            hidden_states = nn.functional.pad(hidden_states, (0, dilation), "constant", 0)
+        # (batch, in_channels, kernel_length, (hop_size + 2*padding)/dilation, dilation)
+        hidden_states = hidden_states.unfold(3, dilation, dilation)
+        hidden_states = hidden_states[:, :, :, :, :hop_size]
+        # (batch, in_channels, kernel_length, dilation, (hop_size + 2*padding)/dilation)
+        hidden_states = hidden_states.transpose(3, 4)
+        # (batch, in_channels, kernel_length, dilation, _, kernel_size)
+        hidden_states = hidden_states.unfold(4, kernel_size, 1)
+
+        # Apply local convolution kernel to hidden_states.
+        output_hidden_states = torch.einsum("bildsk,biokl->bolsd", hidden_states, kernel)
+
+        output_hidden_states = output_hidden_states.to(memory_format=torch.channels_last_3d)
+        bias = bias.unsqueeze(-1).unsqueeze(-1).to(memory_format=torch.channels_last_3d)
+        output_hidden_states = output_hidden_states + bias
+        output_hidden_states = output_hidden_states.contiguous().view(batch, out_channels, -1)
+
+        return output_hidden_states
+
+    def apply_weight_norm(self):
+        nn.utils.weight_norm(self.conv)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.conv)
+
+
+class UnivNetLvcBlock(nn.Module):
+    """
+    Implementation of the location variable convolution (LVC) residual block of the UnivNet residual block. Includes a
+    `UnivNetKernelPredictor` inside to predict the kernels and biases of the LVC layers.
+
+    Based on LVCBlock in
+    [maum-ai/univnet](https://github.com/maum-ai/univnet/blob/9bb2b54838bb6d7ce767131cc7b8b61198bc7558/model/lvcnet.py#L98)
+
+    Parameters:
+        config (`UnivNetConfig`):
+            Config for the `UnivNetModel` model.
+        layer_id (`int`):
+            An integer corresponding to the index of the current LVC resnet block layer. This should be between 0 and
+            `len(config.resblock_stride_sizes) - 1)` inclusive.
+        lvc_hop_size (`int`, *optional*, defaults to 256):
+            The hop size for the location variable convolutional layers.
+    """
+
+    def __init__(
+        self,
+        config: UnivNetConfig,
+        layer_id: int,
+        lvc_hop_size: int = 256,
+    ):
+        super().__init__()
+        self.hidden_channels = config.model_hidden_channels
+        self.kernel_size = config.resblock_kernel_sizes[layer_id]
+        self.stride = config.resblock_stride_sizes[layer_id]
+        self.dilations = config.resblock_dilation_sizes[layer_id]
+        self.cond_hop_length = lvc_hop_size
+        self.leaky_relu_slope = config.leaky_relu_slope
+        self.num_blocks = len(self.dilations)
+
+        self.convt_pre = nn.ConvTranspose1d(
+            self.hidden_channels,
+            self.hidden_channels,
+            2 * self.stride,
+            stride=self.stride,
+            padding=self.stride // 2 + self.stride % 2,
+            output_padding=self.stride % 2,
+        )
+
+        self.kernel_predictor = UnivNetKernelPredictor(config, self.kernel_size, self.num_blocks)
+
+        self.resblocks = nn.ModuleList(
+            [UnivNetLvcResidualBlock(config, self.kernel_size, self.dilations[i]) for i in range(self.num_blocks)]
+        )
+
+    def forward(self, hidden_states: torch.FloatTensor, spectrogram: torch.FloatTensor):
+        # hidden_states: (batch_size, hidden_channels, seq_length)
+        # spectrogram: (batch_size, cond_channels, cond_length)
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+        hidden_states = self.convt_pre(hidden_states)
+
+        kernels, biases = self.kernel_predictor(spectrogram)
+
+        for i, resblock in enumerate(self.resblocks):
+            kernel = kernels[:, i, :, :, :, :]
+            bias = biases[:, i, :, :]
+            hidden_states = resblock(hidden_states, kernel, bias, hop_size=self.cond_hop_length)
+
+        return hidden_states
+
+    def apply_weight_norm(self):
+        nn.utils.weight_norm(self.convt_pre)
+        self.kernel_predictor.apply_weight_norm()
+        for layer in self.resblocks:
+            layer.apply_weight_norm()
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.convt_pre)
+        self.kernel_predictor.remove_weight_norm()
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+
+
+UNIVNET_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`UnivNetConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+UNIVNET_INPUTS_DOCSTRING = r"""
+    Converts a noise waveform and a conditioning spectrogram to a speech waveform. Passing a batch of log-mel
+    spectrograms returns a batch of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a
+    single, un-batched speech waveform.
+
+    Args:
+        input_features (`torch.FloatTensor`):
+            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
+            config.num_mel_channels)`, or un-batched and of shape `(sequence_length, config.num_mel_channels)`.
+        noise_sequence (`torch.FloatTensor`, *optional*):
+            Tensor containing a noise sequence of standard Gaussian noise. Can be batched and of shape `(batch_size,
+            sequence_length, config.model_in_channels)`, or un-batched and of shape (sequence_length,
+            config.model_in_channels)`. If not supplied, will be randomly generated.
+        padding_mask (`torch.BoolTensor`, *optional*):
+            Mask indicating which parts of each sequence are padded. Mask values are selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**
+            - 0 for tokens that are **masked**
+
+            The mask can be batched and of shape `(batch_size, sequence_length)` or un-batched and of shape
+            `(sequence_length,)`.
+        generator (`torch.Generator`, *optional*):
+            A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+            deterministic.
+        return_dict:
+            Whether to return a [`~utils.ModelOutput`] subclass instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    """UnivNet GAN vocoder.""",
+    UNIVNET_START_DOCSTRING,
+)
+class UnivNetModel(PreTrainedModel):
+    config_class = UnivNetConfig
+    main_input_name = "input_features"
+
+    def __init__(self, config: UnivNetConfig):
+        super().__init__(config)
+
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.leaky_relu_slope = config.leaky_relu_slope
+
+        self.conv_pre = nn.Conv1d(
+            config.model_in_channels,
+            config.model_hidden_channels,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+            padding_mode="reflect",
+        )
+
+        # Initialize location-variable convolution ResNet Blocks.
+        num_layers = len(config.resblock_stride_sizes)
+        hop_length = 1
+        hop_lengths = []
+        for stride in config.resblock_stride_sizes:
+            hop_length = hop_length * stride
+            hop_lengths.append(hop_length)
+
+        self.resblocks = nn.ModuleList(
+            [
+                UnivNetLvcBlock(
+                    config,
+                    layer_id=i,
+                    lvc_hop_size=hop_lengths[i],
+                )
+                for i in range(num_layers)
+            ]
+        )
+
+        self.conv_post = nn.Conv1d(config.model_hidden_channels, 1, 7, padding=3, padding_mode="reflect")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(UNIVNET_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=UnivNetModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_features: torch.FloatTensor,
+        noise_sequence: Optional[torch.FloatTensor] = None,
+        padding_mask: Optional[torch.FloatTensor] = None,
+        generator: Optional[torch.Generator] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple[torch.FloatTensor], UnivNetModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+         ```python
+         >>> from transformers import UnivNetFeatureExtractor, UnivNetModel
+         >>> from datasets import load_dataset, Audio
+
+         >>> model = UnivNetModel.from_pretrained("dg845/univnet-dev")
+         >>> feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")
+
+         >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+         >>> # Resample the audio to the feature extractor's sampling rate.
+         >>> ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
+         >>> inputs = feature_extractor(
+         ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
+         ... )
+         >>> audio = model(**inputs).waveforms
+         >>> list(audio.shape)
+         [1, 140288]
+         ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # Resolve batch sizes for noise_sequence and spectrogram
+        spectrogram_batched = input_features.dim() == 3
+        if not spectrogram_batched:
+            input_features = input_features.unsqueeze(0)
+        spectrogram_batch_size, spectrogram_length, _ = input_features.shape
+
+        if noise_sequence is not None:
+            noise_sequence_batched = noise_sequence.dim() == 3
+            if not noise_sequence_batched:
+                noise_sequence = noise_sequence.unsqueeze(0)
+        else:
+            # Randomly generate noise_sequence
+            noise_sequence_shape = (spectrogram_batch_size, spectrogram_length, self.config.model_in_channels)
+            noise_sequence = torch.randn(
+                noise_sequence_shape, generator=generator, dtype=input_features.dtype, device=input_features.device
+            )
+        noise_sequence_batch_size = noise_sequence.shape[0]
+
+        if spectrogram_batch_size > 1 and noise_sequence_batch_size == 1:
+            # Repeat noise_sequence spectrogram_batch_size times
+            noise_sequence = noise_sequence.repeat(spectrogram_batch_size, 1, 1)
+        elif noise_sequence_batch_size > 1 and spectrogram_batch_size == 1:
+            # Repeat spectrogram noise_sequence_batch_size times
+            input_features = input_features.repeat(noise_sequence_batch_size, 1, 1)
+
+        if noise_sequence_batch_size != spectrogram_batch_size:
+            raise ValueError(
+                f"The batch size of `noise_sequence` is {noise_sequence_batch_size} and the batch size of"
+                f" `input_features` is {spectrogram_batch_size}, but the two are expected to be equal."
+            )
+
+        if padding_mask is not None:
+            if padding_mask.dim() == 1:
+                padding_mask = padding_mask.unsqueeze(0)
+            padding_mask_batch_size = padding_mask.shape[0]
+            if padding_mask_batch_size != spectrogram_batch_size:
+                raise ValueError(
+                    f"The batch size of `padding_mask` is {padding_mask_batch_size} and the batch size of"
+                    f" `input_features` is {spectrogram_batch_size}, but the two are expected to be equal."
+                )
+
+        # Change shapes to have channels before sequence lengths
+        hidden_states = noise_sequence.transpose(2, 1)
+        input_features = input_features.transpose(2, 1)
+
+        hidden_states = self.conv_pre(hidden_states)
+
+        for resblock in self.resblocks:
+            hidden_states = resblock(hidden_states, input_features)
+
+        hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+        hidden_states = self.conv_post(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+
+        # Remove sequence length dimension since this collapses to 1
+        # NOTE: keep waveforms batched even if there's only one
+        waveform = hidden_states.squeeze(1)
+
+        # Get sequence lengths for UnivNetFeatureExtractor.batch_decode.
+        waveform_lengths = None
+        if padding_mask is not None:
+            # Padding is always contiguous and added on the right
+            waveform_lengths = torch.sum(padding_mask, dim=1)
+
+        if not return_dict:
+            outputs = (waveform, waveform_lengths)
+            return outputs
+
+        return UnivNetModelOutput(
+            waveforms=waveform,
+            waveform_lengths=waveform_lengths,
+        )
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+    def apply_weight_norm(self):
+        nn.utils.weight_norm(self.conv_pre)
+        for layer in self.resblocks:
+            layer.apply_weight_norm()
+        nn.utils.weight_norm(self.conv_post)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.conv_pre)
+        for layer in self.resblocks:
+            layer.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.conv_post)
diff --git a/src/transformers/models/vipllava/__init__.py b/src/transformers/models/vipllava/__init__.py
new file mode 100644
index 00000000000000..2853605ba2d275
--- /dev/null
+++ b/src/transformers/models/vipllava/__init__.py
@@ -0,0 +1,54 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
+
+
+_import_structure = {"configuration_vipllava": ["VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "VipLlavaConfig"]}
+
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_vipllava"] = [
+        "VIPLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "VipLlavaForConditionalGeneration",
+        "VipLlavaPreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_vipllava import VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, VipLlavaConfig
+
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_vipllava import (
+            VIPLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST,
+            VipLlavaForConditionalGeneration,
+            VipLlavaPreTrainedModel,
+        )
+
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure)
diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py
new file mode 100644
index 00000000000000..977506a3d51258
--- /dev/null
+++ b/src/transformers/models/vipllava/configuration_vipllava.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" VipLlava model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+VIPLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "ybelkada/vip-llava-7b-hf": "https://huggingface.co/llava-hf/vip-llava-7b-hf/resolve/main/config.json",
+}
+
+
+class VipLlavaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`VipLlavaForConditionalGeneration`]. It is used to instantiate an
+    VipLlava model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the VipLlava-9B.
+
+    e.g. [ybelkada/vip-llava-7b-hf](https://huggingface.co/ybelkada/vip-llava-7b-hf)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`VipLlavaVisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        ignore_index (`int`, *optional*, defaults to -100):
+            The ignore index for the loss function.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        projector_layernorm_eps (`float`, *optional*, defaults to 1e-05):
+            The layer norm epsilon of the projector layernorm
+        vision_feature_layers (`List[int]`, *optional*, defaults to `[-2, -5, -8, -11, 6]`):
+            The list of layers to select the vision features from.
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the VipLlava model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~VipLlavaForConditionalGeneration`]
+
+    Example:
+
+    ```python
+    >>> from transformers import VipLlavaForConditionalGeneration, VipLlavaConfig, CLIPVisionConfig, LlamaConfig
+
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a VipLlava vipllava-7b style configuration
+    >>> configuration = VipLlavaConfig(vision_config, text_config)
+
+    >>> # Initializing a model from the vipllava-7b style configuration
+    >>> model = VipLlavaForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "vipllava"
+    is_composition = False
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        ignore_index=-100,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        projector_layernorm_eps=1e-5,
+        vision_feature_layers=[-2, -5, -8, -11, 6],
+        vocab_size=32000,
+        **kwargs,
+    ):
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.projector_layernorm_eps = projector_layernorm_eps
+        self.vision_feature_layers = vision_feature_layers
+        self.vocab_size = vocab_size
+
+        self.vision_config = vision_config
+
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = (
+                vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model"
+            )
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+        self.vocab_size = self.vocab_size
+
+        self.text_config = text_config
+
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama"
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+            self.vocab_size = self.text_config.vocab_size
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["llama"]()
+
+        super().__init__(**kwargs)
diff --git a/src/transformers/models/vipllava/convert_vipllava_weights_to_hf.py b/src/transformers/models/vipllava/convert_vipllava_weights_to_hf.py
new file mode 100644
index 00000000000000..a96d56084ce008
--- /dev/null
+++ b/src/transformers/models/vipllava/convert_vipllava_weights_to_hf.py
@@ -0,0 +1,132 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import torch
+from huggingface_hub import hf_hub_download
+
+from transformers import (
+    AddedToken,
+    AutoConfig,
+    AutoTokenizer,
+    CLIPImageProcessor,
+    LlavaProcessor,
+    VipLlavaConfig,
+    VipLlavaForConditionalGeneration,
+)
+
+
+KEYS_TO_MODIFY_MAPPING = {
+    "model.vision_tower.": "",
+    "model.mm_projector": "multi_modal_projector",
+    "model": "model.model",
+    "vision_model.model": "vision_model",
+    "lm_head": "language_model.lm_head",
+    "model.model": "language_model.model",
+    "multi_modal_projector.0": "multi_modal_projector.linear_1",
+    "multi_modal_projector.2": "multi_modal_projector.linear_2",
+    "final_linear.0": "linear_1",
+    "final_linear.2": "linear_2",
+    "multi_modal_projector.clip_layernorm": "multi_modal_projector.projector_layernorm",
+}
+
+
+# Copied from transformers.models.llava.convert_llava_weights_to_hf.convert_state_dict_to_hf
+def convert_state_dict_to_hf(state_dict):
+    new_state_dict = {}
+    for key, value in state_dict.items():
+        for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
+            if key_to_modify in key:
+                key = key.replace(key_to_modify, new_key)
+        new_state_dict[key] = value
+    return new_state_dict
+
+
+def convert_vipllava_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id):
+    torch.set_default_dtype(torch.float16)
+    text_config = AutoConfig.from_pretrained(text_model_id)
+
+    tokenizer = AutoTokenizer.from_pretrained(text_model_id)
+    tokenizer.add_tokens(AddedToken("<image>", special=True, normalized=False))
+    tokenizer.add_special_tokens({"pad_token": "<pad>"})
+
+    image_processor = CLIPImageProcessor.from_pretrained(vision_model_id)
+
+    processor = LlavaProcessor(tokenizer=tokenizer, image_processor=image_processor)
+
+    config = VipLlavaConfig(text_config=text_config)
+    config.pad_token_id = 32001
+
+    with torch.device("meta"):
+        model = VipLlavaForConditionalGeneration(config)
+
+    # Pad to 64 for performance reasons
+    pad_shape = 64
+
+    state_dict_path = hf_hub_download(old_state_dict_id, "model_state_dict_7b.bin")
+
+    state_dict = torch.load(state_dict_path, map_location="cpu")
+    state_dict = convert_state_dict_to_hf(state_dict)
+
+    model.load_state_dict(state_dict, strict=True, assign=True)
+
+    pre_expansion_embeddings = model.language_model.model.embed_tokens.weight.data
+    mu = torch.mean(pre_expansion_embeddings, dim=0).float()
+    n = pre_expansion_embeddings.size()[0]
+    sigma = ((pre_expansion_embeddings - mu).T @ (pre_expansion_embeddings - mu)) / n
+    dist = torch.distributions.multivariate_normal.MultivariateNormal(mu, covariance_matrix=1e-5 * sigma)
+
+    # We add an image token so we resize the model
+    model.resize_token_embeddings(config.text_config.vocab_size + 2, pad_shape)
+    model.language_model.model.embed_tokens.weight.data[32000:] = torch.stack(
+        tuple((dist.sample() for _ in range(model.language_model.model.embed_tokens.weight.data[32000:].shape[0]))),
+        dim=0,
+    )
+    model.language_model.lm_head.weight.data[32000:] = torch.stack(
+        tuple((dist.sample() for _ in range(model.language_model.lm_head.weight.data[32000:].shape[0]))),
+        dim=0,
+    )
+    model.config.vocab_size = model.config.vocab_size + pad_shape
+    model.config.text_config.vocab_size = model.config.text_config.vocab_size + pad_shape
+
+    model.push_to_hub(output_hub_path)
+    processor.push_to_hub(output_hub_path)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--text_model_id",
+        help="Hub location of the text model",
+    )
+    parser.add_argument(
+        "--vision_model_id",
+        help="Hub location of the vision model",
+    )
+    parser.add_argument(
+        "--output_hub_path",
+        help="Location on the hub of the converted model",
+    )
+    parser.add_argument(
+        "--old_state_dict_id",
+        help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`",
+    )
+    args = parser.parse_args()
+    convert_vipllava_llama_to_hf(
+        args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py
new file mode 100644
index 00000000000000..0b1dc3fa86b383
--- /dev/null
+++ b/src/transformers/models/vipllava/modeling_vipllava.py
@@ -0,0 +1,533 @@
+# coding=utf-8
+# Copyright 2023 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch VipLlava model."""
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+
+import torch
+import torch.utils.checkpoint
+from torch import nn
+
+from ... import PreTrainedModel
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...modeling_outputs import ModelOutput
+from ...utils import (
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ..auto import AutoModel, AutoModelForCausalLM
+from .configuration_vipllava import VipLlavaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "VipLlavaConfig"
+
+VIPLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = [
+    "llava-hf/vip-llava-7b-hf",
+    # See all VipLlava models at https://huggingface.co/models?filter=vipllava
+]
+
+
+@dataclass
+# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->VipLlava
+class VipLlavaCausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for VipLlava causal language model (or autoregressive) outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+            sequence_length, hidden_size)`.
+
+            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class VipLlavaMultiModalProjector(nn.Module):
+    def __init__(self, config: VipLlavaConfig):
+        super().__init__()
+        self.projector_layernorm = nn.LayerNorm(
+            len(config.vision_feature_layers) * config.vision_config.hidden_size, eps=config.projector_layernorm_eps
+        )
+
+        self.linear_1 = nn.Linear(
+            len(config.vision_feature_layers) * config.vision_config.hidden_size,
+            config.text_config.hidden_size,
+            bias=True,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size, bias=True)
+
+    def forward(self, hidden_states):
+        hidden_states = self.projector_layernorm(hidden_states)
+        hidden_states = self.linear_1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+VIPLLAVA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`VipLlavaConfig`] or [`VipLlavaVisionConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare VipLlava Model outputting raw hidden-states without any specific head on top.",
+    VIPLLAVA_START_DOCSTRING,
+)
+# Copied from transformers.models.llava.modeling_llava.LlavaPreTrainedModel with Llava->VipLlava,llava->vipllava
+class VipLlavaPreTrainedModel(PreTrainedModel):
+    config_class = VipLlavaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["VipLlavaVisionAttention"]
+    _supports_flash_attn_2 = True
+
+    def _init_weights(self, module):
+        # important: this ported version of VipLlava isn't meant for training from scratch - only
+        # inference and fine-tuning - so the proper init weights code has been removed - the original codebase
+        # https://github.com/haotian-liu/LLaVA/tree/main/vipllava should serve for that purpose
+        std = (
+            self.config.initializer_range
+            if hasattr(self.config, "initializer_range")
+            else self.config.text_config.initializer_range
+        )
+
+        if hasattr(module, "class_embedding"):
+            module.class_embedding.data.normal_(mean=0.0, std=std)
+
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+VIPLLAVA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details ([]`LlavaProcessor`] uses
+            [`CLIPImageProcessor`] for processing images).
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    """The VIPLLAVA model which consists of a vision backbone and a language model.""",
+    VIPLLAVA_START_DOCSTRING,
+)
+# Copied from transformers.models.llava.modeling_llava.LlavaForConditionalGeneration with LLAVA->VIPLLAVA,Llava->VipLlava
+class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel):
+    def __init__(self, config: VipLlavaConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+
+        self.multi_modal_projector = VipLlavaMultiModalProjector(config)
+        self.vocab_size = config.vocab_size
+        self.language_model = AutoModelForCausalLM.from_config(
+            config.text_config, attn_implementation=config._attn_implementation
+        )
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def get_output_embeddings(self):
+        return self.language_model.get_output_embeddings()
+
+    def set_output_embeddings(self, new_embeddings):
+        self.language_model.set_output_embeddings(new_embeddings)
+
+    def set_decoder(self, decoder):
+        self.language_model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.language_model.get_decoder()
+
+    def tie_weights(self):
+        return self.language_model.tie_weights()
+
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding:
+        model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
+        # update vocab size
+        self.config.text_config.vocab_size = model_embeds.num_embeddings
+        self.config.vocab_size = model_embeds.num_embeddings
+        self.vocab_size = model_embeds.num_embeddings
+        return model_embeds
+
+    def _merge_input_ids_with_image_features(
+        self, image_features, inputs_embeds, input_ids, attention_mask, position_ids
+    ):
+        num_images, num_image_patches, embed_dim = image_features.shape
+        batch_size, sequence_length = input_ids.shape
+        left_padding = not torch.sum(input_ids[:, -1] == torch.tensor(self.pad_token_id))
+        # 1. Create a mask to know where special image tokens are
+        special_image_token_mask = input_ids == self.config.image_token_index
+        num_special_image_tokens = torch.sum(special_image_token_mask, dim=-1)
+        # Compute the maximum embed dimension
+        max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
+        batch_indices, non_image_indices = torch.where(input_ids != self.config.image_token_index)
+
+        # 2. Compute the positions where text should be written
+        # Calculate new positions for text tokens in merged image-text sequence.
+        # `special_image_token_mask` identifies image tokens. Each image token will be replaced by `nb_text_tokens_per_images - 1` text tokens.
+        # `torch.cumsum` computes how each image token shifts subsequent text token positions.
+        # - 1 to adjust for zero-based indexing, as `cumsum` inherently increases indices by one.
+        new_token_positions = torch.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
+        nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
+        if left_padding:
+            new_token_positions += nb_image_pad[:, None]  # offset for left padding
+        text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
+
+        # 3. Create the full embedding, already padded to the maximum position
+        final_embedding = torch.zeros(
+            batch_size, max_embed_dim, embed_dim, dtype=inputs_embeds.dtype, device=inputs_embeds.device
+        )
+        final_attention_mask = torch.zeros(
+            batch_size, max_embed_dim, dtype=attention_mask.dtype, device=inputs_embeds.device
+        )
+        # In case the Vision model or the Language model has been offloaded to CPU, we need to manually
+        # set the corresponding tensors into their correct target device.
+        target_device = inputs_embeds.device
+        batch_indices, non_image_indices, text_to_overwrite = (
+            batch_indices.to(target_device),
+            non_image_indices.to(target_device),
+            text_to_overwrite.to(target_device),
+        )
+        attention_mask = attention_mask.to(target_device)
+
+        # 4. Fill the embeddings based on the mask. If we have ["hey" "<image>", "how", "are"]
+        # we need to index copy on [0, 577, 578, 579] for the text and [1:576] for the image features
+        final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
+        final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
+
+        # 5. Fill the embeddings corresponding to the images. Anything that is still zeros needs filling
+        image_to_overwrite = torch.all(final_embedding == 0, dim=-1)
+        image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None].to(target_device)
+
+        if image_to_overwrite.sum() != image_features.shape[:-1].numel():
+            raise ValueError(
+                f"The input provided to the model are wrong. The number of image tokens is {torch.sum(special_image_token_mask)} while"
+                f" the number of image given to the model is {num_images}. This prevents correct indexing and breaks batch generation."
+            )
+
+        final_embedding[image_to_overwrite] = image_features.contiguous().reshape(-1, embed_dim).to(target_device)
+        final_attention_mask |= image_to_overwrite
+        position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
+        return final_embedding, final_attention_mask, position_ids
+
+    @add_start_docstrings_to_model_forward(VIPLLAVA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=VipLlavaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    # Ignore copy
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        pixel_values: torch.FloatTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[List[torch.FloatTensor]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layers: Optional[List[int]] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[Tuple, VipLlavaCausalLMOutputWithPast]:
+        r"""
+        Args:
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration
+
+        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vipllava-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/vipllava-7b-hf")
+
+        >>> prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(text=text, images=image, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "USER: <image> \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on a green surface, with a red ball in its paw."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_feature_layers = (
+            vision_feature_layers if vision_feature_layers is not None else self.config.vision_feature_layers
+        )
+
+        if inputs_embeds is None:
+            # 1. Extra the input embeddings
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+            # 2. Merge text and images
+            if pixel_values is not None and input_ids.shape[1] != 1:
+                image_outputs = self.vision_tower(pixel_values, output_hidden_states=True)
+                # For VIP-llava, the image features are computed this way
+                # We select the features from index 1: for the layers -2, -5, -8, -11 and 6
+                image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers]
+                image_features = torch.cat(image_features, dim=-1)
+
+                image_features = self.multi_modal_projector(image_features)
+                inputs_embeds, attention_mask, position_ids = self._merge_input_ids_with_image_features(
+                    image_features, inputs_embeds, input_ids, attention_mask, position_ids
+                )
+                if labels is None:
+                    labels = torch.full_like(attention_mask, self.config.ignore_index).to(torch.long)
+            else:
+                # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+                # generation with cache
+                if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                    # Retrieve the first layer to inspect the logits and mask out the hidden states
+                    # that are set to 0
+                    first_layer_past_key_value = past_key_values[0][0][:, 0, :, 0]
+                    batch_index, non_attended_tokens = torch.where(first_layer_past_key_value == 0)
+                    # Get the target length
+                    target_seqlen = first_layer_past_key_value.shape[-1] + 1
+
+                    extended_attention_mask = torch.ones(
+                        (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+
+                    # Zero-out the places where we don't need to attend
+                    extended_attention_mask[batch_index, non_attended_tokens] = 0
+
+                    attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                    position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        logits = outputs[0]
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:]
+                shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = nn.CrossEntropyLoss()
+            loss = loss_fct(
+                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device)
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return VipLlavaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, inputs_embeds=None, pixel_values=None, attention_mask=None, **kwargs
+    ):
+        if past_key_values is not None:
+            if isinstance(past_key_values, Cache):
+                cache_length = past_key_values.get_seq_length()
+                past_length = past_key_values.seen_tokens
+            else:
+                cache_length = past_length = past_key_values[0][0].shape[2]
+
+            # Keep only the unprocessed tokens:
+            # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
+            # some of the inputs are exclusivelly passed as part of the cache (e.g. when passing input_embeds as
+            # input)
+            if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
+                input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
+            # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
+            # input_ids based on the past_length.
+            elif past_length < input_ids.shape[1]:
+                input_ids = input_ids[:, past_length:]
+            # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
+            elif self.config.image_token_index in input_ids:
+                input_ids = input_ids[:, input_ids.shape[1] - 1 :]
+            # If the cache has seen more tokens than it can hold, then the cache has a size limit. Let's discard the
+            # older attention values, as their corresponding values are not part of the input.
+            if cache_length < past_length and attention_mask is not None:
+                attention_mask = attention_mask[:, -(cache_length + input_ids.shape[1]) :]
+
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+                "pixel_values": pixel_values,
+            }
+        )
+        return model_inputs
+
+    def _reorder_cache(self, *args, **kwargs):
+        return self.language_model._reorder_cache(*args, **kwargs)
diff --git a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
index 65f55d55e82181..395d02bf0bf854 100644
--- a/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
+++ b/src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
@@ -290,33 +290,7 @@ def get_output_embeddings(self):
     def set_output_embeddings(self, new_embeddings):
         return self.decoder.set_output_embeddings(new_embeddings)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r"""
-        Example:
-
-        ```python
-        >>> from transformers import TFVisionEncoderDecoderModel, AutoImageProcessor, AutoTokenizer
-        >>> from PIL import Image
-        >>> import requests
-
-        >>> image_processor = AutoImageProcessor.from_pretrained("ydshieh/vit-gpt2-coco-en")
-        >>> decoder_tokenizer = AutoTokenizer.from_pretrained("ydshieh/vit-gpt2-coco-en")
-        >>> model = TFVisionEncoderDecoderModel.from_pretrained("ydshieh/vit-gpt2-coco-en")
-
-        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-        >>> img = Image.open(requests.get(url, stream=True).raw)
-        >>> pixel_values = image_processor(images=img, return_tensors="tf").pixel_values  # Batch size 1
-
-        >>> output_ids = model.generate(
-        ...     pixel_values, max_length=16, num_beams=4, return_dict_in_generate=True
-        ... ).sequences
-
-        >>> preds = decoder_tokenizer.batch_decode(output_ids, skip_special_tokens=True)
-        >>> preds = [pred.strip() for pred in preds]
-
-        >>> assert preds == ["a cat laying on top of a couch next to another cat"]
-        ```"""
+    def tf_to_pt_weight_rename(self, tf_weight):
         # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
         # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
         # However, the name of that extra layer is the name of the MainLayer in the base model. We make the assumption
@@ -327,18 +301,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         # often safetensors now, we don't know if we're going to be crossloading until we sniff the weights file.
         # Therefore, we specify tf_to_pt_weight_rename anyway, and let the super method figure out if it needs it
         # or not.
-
-        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)
-        encoder_model_type = config.encoder.model_type
-
-        def tf_to_pt_weight_rename(tf_weight):
-            if "encoder" in tf_weight and "decoder" not in tf_weight:
-                return re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight)
-            else:
-                return tf_weight
-
-        kwargs["tf_to_pt_weight_rename"] = tf_to_pt_weight_rename
-        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        encoder_model_type = self.config.encoder.model_type
+        if "encoder" in tf_weight and "decoder" not in tf_weight:
+            return (re.sub(rf"encoder\.{encoder_model_type}\.", "encoder.", tf_weight),)
+        else:
+            return (tf_weight,)
 
     @classmethod
     def from_encoder_decoder_pretrained(
diff --git a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
index 34349c8661757c..d0e91640f688f8 100644
--- a/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
+++ b/src/transformers/models/vision_text_dual_encoder/modeling_tf_vision_text_dual_encoder.py
@@ -227,32 +227,24 @@ def build(self, input_shape=None):
         self.logit_scale = self.add_weight(shape=(1,), initializer=initializer, name="logit_scale")
         super().build(input_shape)
 
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+    def tf_to_pt_weight_rename(self, tf_weight):
         # Matt: The TF and PT weights don't align because our TF base classes have an extra layer compared to PT models
         # (the main model stem is in the MainLayer class). If we remove that layer, then weight names sync up as normal.
         # However, the name of that extra layer is the name of the MainLayer in the base model.
-
-        if kwargs.get("from_pt", False):
-
-            def tf_to_pt_weight_rename(tf_weight):
-                if "vision_model" in tf_weight:
-                    if tf_weight.count("vision_model") == 1:
-                        return re.sub(r"vision_model\..*?\.", "vision_model.", tf_weight)
-                    elif tf_weight.count("vision_model") == 2:
-                        return re.sub(r"vision_model\..*?\.vision_model", "vision_model.vision_model", tf_weight)
-                    else:
-                        raise ValueError(
-                            f"Unexpected weight name {tf_weight}. Please file an issue on the"
-                            " Transformers repo to let us know about this error!"
-                        )
-                elif "text_model" in tf_weight:
-                    return re.sub(r"text_model\..*?\.", "text_model.", tf_weight)
-                else:
-                    return tf_weight
-
-            kwargs["tf_to_pt_weight_rename"] = tf_to_pt_weight_rename
-        return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        if "vision_model" in tf_weight:
+            if tf_weight.count("vision_model") == 1:
+                return re.sub(r"vision_model\..*?\.", "vision_model.", tf_weight)
+            elif tf_weight.count("vision_model") == 2:
+                return re.sub(r"vision_model\..*?\.vision_model", "vision_model.vision_model", tf_weight)
+            else:
+                raise ValueError(
+                    f"Unexpected weight name {tf_weight}. Please file an issue on the"
+                    " Transformers repo to let us know about this error!"
+                )
+        elif "text_model" in tf_weight:
+            return re.sub(r"text_model\..*?\.", "text_model.", tf_weight)
+        else:
+            return (tf_weight,)
 
     @add_start_docstrings_to_model_forward(VISION_TEXT_DUAL_ENCODER_TEXT_INPUTS_DOCSTRING)
     def get_text_features(
diff --git a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py
index b73c5f346dba57..0ccd9b9f6685fe 100644
--- a/src/transformers/models/vit/convert_vit_timm_to_pytorch.py
+++ b/src/transformers/models/vit/convert_vit_timm_to_pytorch.py
@@ -16,14 +16,13 @@
 
 
 import argparse
-import json
 from pathlib import Path
 
 import requests
 import timm
 import torch
-from huggingface_hub import hf_hub_download
 from PIL import Image
+from timm.data import ImageNetInfo, infer_imagenet_subset
 
 from transformers import DeiTImageProcessor, ViTConfig, ViTForImageClassification, ViTImageProcessor, ViTModel
 from transformers.utils import logging
@@ -60,13 +59,11 @@ def create_rename_keys(config, base_model=False):
     )
 
     if base_model:
-        # layernorm + pooler
+        # layernorm
         rename_keys.extend(
             [
                 ("norm.weight", "layernorm.weight"),
                 ("norm.bias", "layernorm.bias"),
-                ("pre_logits.fc.weight", "pooler.dense.weight"),
-                ("pre_logits.fc.bias", "pooler.dense.bias"),
             ]
         )
 
@@ -140,60 +137,68 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
     # define default ViT configuration
     config = ViTConfig()
     base_model = False
-    # dataset (ImageNet-21k only or also fine-tuned on ImageNet 2012), patch_size and image_size
-    if vit_name[-5:] == "in21k":
-        base_model = True
-        config.patch_size = int(vit_name[-12:-10])
-        config.image_size = int(vit_name[-9:-6])
-    else:
-        config.num_labels = 1000
-        repo_id = "huggingface/label-files"
-        filename = "imagenet-1k-id2label.json"
-        id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r"))
-        id2label = {int(k): v for k, v in id2label.items()}
-        config.id2label = id2label
-        config.label2id = {v: k for k, v in id2label.items()}
-        config.patch_size = int(vit_name[-6:-4])
-        config.image_size = int(vit_name[-3:])
-    # size of the architecture
-    if "deit" in vit_name:
-        if vit_name[9:].startswith("tiny"):
-            config.hidden_size = 192
-            config.intermediate_size = 768
-            config.num_hidden_layers = 12
-            config.num_attention_heads = 3
-        elif vit_name[9:].startswith("small"):
-            config.hidden_size = 384
-            config.intermediate_size = 1536
-            config.num_hidden_layers = 12
-            config.num_attention_heads = 6
-        else:
-            pass
-    else:
-        if vit_name[4:].startswith("small"):
-            config.hidden_size = 768
-            config.intermediate_size = 2304
-            config.num_hidden_layers = 8
-            config.num_attention_heads = 8
-        elif vit_name[4:].startswith("base"):
-            pass
-        elif vit_name[4:].startswith("large"):
-            config.hidden_size = 1024
-            config.intermediate_size = 4096
-            config.num_hidden_layers = 24
-            config.num_attention_heads = 16
-        elif vit_name[4:].startswith("huge"):
-            config.hidden_size = 1280
-            config.intermediate_size = 5120
-            config.num_hidden_layers = 32
-            config.num_attention_heads = 16
 
     # load original model from timm
     timm_model = timm.create_model(vit_name, pretrained=True)
     timm_model.eval()
 
-    # load state_dict of original model, remove and rename some keys
+    # detect unsupported ViT models in transformers
+    # fc_norm is present
+    if not isinstance(getattr(timm_model, "fc_norm", None), torch.nn.Identity):
+        raise ValueError(f"{vit_name} is not supported in transformers because of the presence of fc_norm.")
+
+    # use of global average pooling in combination (or without) class token
+    if getattr(timm_model, "global_pool", None) == "avg":
+        raise ValueError(f"{vit_name} is not supported in transformers because of use of global average pooling.")
+
+    # CLIP style vit with norm_pre layer present
+    if "clip" in vit_name and not isinstance(getattr(timm_model, "norm_pre", None), torch.nn.Identity):
+        raise ValueError(
+            f"{vit_name} is not supported in transformers because it's a CLIP style ViT with norm_pre layer."
+        )
+
+    # SigLIP style vit with attn_pool layer present
+    if "siglip" in vit_name and getattr(timm_model, "global_pool", None) == "map":
+        raise ValueError(
+            f"{vit_name} is not supported in transformers because it's a SigLIP style ViT with attn_pool."
+        )
+
+    # use of layer scale in ViT model blocks
+    if not isinstance(getattr(timm_model.blocks[0], "ls1", None), torch.nn.Identity) or not isinstance(
+        getattr(timm_model.blocks[0], "ls2", None), torch.nn.Identity
+    ):
+        raise ValueError(f"{vit_name} is not supported in transformers because it uses a layer scale in its blocks.")
+
+    # Hybrid ResNet-ViTs
+    if not isinstance(timm_model.patch_embed, timm.layers.PatchEmbed):
+        raise ValueError(f"{vit_name} is not supported in transformers because it is a hybrid ResNet-ViT.")
+
+    # get patch size and image size from the patch embedding submodule
+    config.patch_size = timm_model.patch_embed.patch_size[0]
+    config.image_size = timm_model.patch_embed.img_size[0]
+
+    # retrieve architecture-specific parameters from the timm model
+    config.hidden_size = timm_model.embed_dim
+    config.intermediate_size = timm_model.blocks[0].mlp.fc1.out_features
+    config.num_hidden_layers = len(timm_model.blocks)
+    config.num_attention_heads = timm_model.blocks[0].attn.num_heads
+
+    # check whether the model has a classification head or not
+    if timm_model.num_classes != 0:
+        config.num_labels = timm_model.num_classes
+        # infer ImageNet subset from timm model
+        imagenet_subset = infer_imagenet_subset(timm_model)
+        dataset_info = ImageNetInfo(imagenet_subset)
+        config.id2label = {i: dataset_info.index_to_label_name(i) for i in range(dataset_info.num_classes())}
+        config.label2id = {v: k for k, v in config.id2label.items()}
+    else:
+        print(f"{vit_name} is going to be converted as a feature extractor only.")
+        base_model = True
+
+    # load state_dict of original model
     state_dict = timm_model.state_dict()
+
+    # remove and rename some keys in the state dict
     if base_model:
         remove_classification_head_(state_dict)
     rename_keys = create_rename_keys(config, base_model)
@@ -202,8 +207,8 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
     read_in_q_k_v(state_dict, config, base_model)
 
     # load HuggingFace model
-    if vit_name[-5:] == "in21k":
-        model = ViTModel(config).eval()
+    if base_model:
+        model = ViTModel(config, add_pooling_layer=False).eval()
     else:
         model = ViTForImageClassification(config).eval()
     model.load_state_dict(state_dict)
@@ -219,8 +224,8 @@ def convert_vit_checkpoint(vit_name, pytorch_dump_folder_path):
 
     if base_model:
         timm_pooled_output = timm_model.forward_features(pixel_values)
-        assert timm_pooled_output.shape == outputs.pooler_output.shape
-        assert torch.allclose(timm_pooled_output, outputs.pooler_output, atol=1e-3)
+        assert timm_pooled_output.shape == outputs.last_hidden_state.shape
+        assert torch.allclose(timm_pooled_output, outputs.last_hidden_state, atol=1e-1)
     else:
         timm_logits = timm_model(pixel_values)
         assert timm_logits.shape == outputs.logits.shape
diff --git a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py
index 81a07a9d79575f..1e4b0652ff5b4e 100644
--- a/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py
+++ b/src/transformers/models/vit_hybrid/image_processing_vit_hybrid.py
@@ -84,10 +84,6 @@ class ViTHybridImageProcessor(BaseImageProcessor):
             Can be overridden by the `image_std` parameter in the `preprocess` method.
         do_convert_rgb (`bool`, *optional*, defaults to `True`):
             Whether to convert the image to RGB.
-        use_square_size (`bool`, *optional*, defaults to `False`):
-            The value to be passed to `get_size_dict` as `default_to_square` when computing the image size. If the
-            `size` argument in `get_size_dict` is an `int`, it determines whether to default to a square image or not.
-            Note that this attribute is not used in computing `crop_size` via calling `get_size_dict`.
     """
 
     model_input_names = ["pixel_values"]
@@ -105,12 +101,11 @@ def __init__(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_convert_rgb: bool = True,
-        use_square_size: bool = False,
         **kwargs,
     ) -> None:
         super().__init__(**kwargs)
         size = size if size is not None else {"shortest_edge": 224}
-        size = get_size_dict(size, default_to_square=use_square_size)
+        size = get_size_dict(size, default_to_square=False)
         crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
         crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
 
@@ -125,7 +120,6 @@ def __init__(
         self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
         self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
         self.do_convert_rgb = do_convert_rgb
-        self.use_square_size = use_square_size
 
     # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
     def resize(
@@ -153,13 +147,19 @@ def resize(
             input_data_format (`ChannelDimension` or `str`, *optional*):
                 The channel dimension format of the input image. If not provided, it will be inferred.
         """
-        size = get_size_dict(size, default_to_square=self.use_square_size)
-        if "shortest_edge" not in size:
-            raise ValueError(f"The `size` parameter must contain the key `shortest_edge`. Got {size.keys()}")
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
         output_size = get_resize_output_image_size(
             image,
-            size=size["shortest_edge"],
-            default_to_square=self.use_square_size,
+            size=size,
+            default_to_square=default_to_square,
             input_data_format=input_data_format,
         )
         return resize(
@@ -243,7 +243,7 @@ def preprocess(
         """
         do_resize = do_resize if do_resize is not None else self.do_resize
         size = size if size is not None else self.size
-        size = get_size_dict(size, param_name="size", default_to_square=self.use_square_size)
+        size = get_size_dict(size, param_name="size", default_to_square=False)
         resample = resample if resample is not None else self.resample
         do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
         crop_size = crop_size if crop_size is not None else self.crop_size
diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py
index 4015875f0c7e27..7af69d28697cd8 100644
--- a/src/transformers/models/vitdet/modeling_vitdet.py
+++ b/src/transformers/models/vitdet/modeling_vitdet.py
@@ -815,8 +815,8 @@ def get_input_embeddings(self) -> VitDetEmbeddings:
     def forward(
         self,
         pixel_values: torch.Tensor,
-        output_attentions: Optional[bool] = None,
         output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
         return_dict: Optional[bool] = None,
     ) -> BackboneOutput:
         """
diff --git a/src/transformers/models/whisper/convert_openai_to_hf.py b/src/transformers/models/whisper/convert_openai_to_hf.py
index 0d6cdaa95882a5..8d29a8434acde4 100755
--- a/src/transformers/models/whisper/convert_openai_to_hf.py
+++ b/src/transformers/models/whisper/convert_openai_to_hf.py
@@ -21,13 +21,22 @@
 import tempfile
 import urllib
 import warnings
+from typing import Any, Optional, Tuple
 
 import torch
 from huggingface_hub.utils import insecure_hashlib
 from torch import nn
 from tqdm import tqdm
 
-from transformers import WhisperConfig, WhisperForConditionalGeneration, WhisperTokenizer
+from transformers import (
+    GenerationConfig,
+    WhisperConfig,
+    WhisperFeatureExtractor,
+    WhisperForConditionalGeneration,
+    WhisperProcessor,
+    WhisperTokenizer,
+    WhisperTokenizerFast,
+)
 from transformers.models.whisper.tokenization_whisper import LANGUAGES, bytes_to_unicode
 from transformers.utils.import_utils import _is_package_available
 
@@ -43,14 +52,47 @@
     "medium": "https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt",
     "large": "https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large.pt",
     "large-v2": "https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt",
+    "large-v3": "https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt",
 }
 
+
 _TOKENIZERS = {
     "multilingual": "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/multilingual.tiktoken",
     "english": "https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/gpt2.tiktoken",
 }
 
 
+def _get_generation_config(
+    is_multilingual: bool,
+    num_languages: int = 100,
+    openai_version: Optional[str] = None,
+) -> GenerationConfig:
+    """
+    Loads the appropriate generation config from HF repo
+    """
+    if openai_version is not None:
+        repo = f"openai/whisper-{openai_version}"
+    elif not is_multilingual:
+        repo = "openai/whisper-medium.en"
+    elif num_languages < 100:
+        repo = "openai/whisper-large-v2"
+    else:
+        repo = "openai/whisper-large-v3"
+
+    gen_cfg = GenerationConfig.from_pretrained(repo)
+    if openai_version is None:
+        gen_cfg.alignment_heads = None
+        warnings.warn(
+            "Alignment heads have not been included in the generation config, since they are available "
+            "only for the original OpenAI checkpoints."
+            "If you want to use word-level timestamps with a custom version of Whisper,"
+            "see https://github.com/openai/whisper/blob/main/notebooks/Multilingual_ASR.ipynb"
+            "for the example of how to produce word-level timestamps manually."
+        )
+
+    return gen_cfg
+
+
 def remove_ignore_keys_(state_dict):
     ignore_keys = ["layers", "blocks"]
     for k in ignore_keys:
@@ -102,7 +144,7 @@ def make_linear_from_emb(emb):
     return lin_layer
 
 
-def _download(url: str, root: str) -> io.BytesIO:
+def _download(url: str, root: str) -> Any:
     os.makedirs(root, exist_ok=True)
     filename = os.path.basename(url)
 
@@ -140,12 +182,17 @@ def _download(url: str, root: str) -> io.BytesIO:
     return torch.load(io.BytesIO(model_bytes))
 
 
-def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
+def convert_openai_whisper_to_tfms(
+    checkpoint_path, pytorch_dump_folder_path
+) -> Tuple[WhisperForConditionalGeneration, bool, int]:
     if ".pt" not in checkpoint_path:
         root = os.path.dirname(pytorch_dump_folder_path) or "."
         original_checkpoint = _download(_MODELS[checkpoint_path], root)
+        openai_version = checkpoint_path
     else:
         original_checkpoint = torch.load(checkpoint_path, map_location="cpu")
+        openai_version = None
+
     dimensions = original_checkpoint["dims"]
     state_dict = original_checkpoint["model_state_dict"]
     proj_out_weights = state_dict["decoder.token_embedding.weight"]
@@ -154,6 +201,9 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
     tie_embeds = True
     ffn_dim = state_dict["decoder.layers.0.fc1.weight"].shape[0]
 
+    # a hacky way to properly set up the bos/eos/pad token ids in the model
+    endoftext_id = 50257 if dimensions["n_vocab"] > 51865 else 50256
+
     config = WhisperConfig(
         vocab_size=dimensions["n_vocab"],
         encoder_ffn_dim=ffn_dim,
@@ -166,6 +216,10 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
         decoder_layers=dimensions["n_text_layer"],
         decoder_attention_heads=dimensions["n_text_head"],
         max_source_positions=dimensions["n_audio_ctx"],
+        eos_token_id=endoftext_id,
+        bos_token_id=endoftext_id,
+        pad_token_id=endoftext_id,
+        decoder_start_token_id=endoftext_id + 1,
     )
 
     model = WhisperForConditionalGeneration(config)
@@ -184,7 +238,17 @@ def convert_openai_whisper_to_tfms(checkpoint_path, pytorch_dump_folder_path):
     else:
         model.proj_out.weight.data = proj_out_weights
 
-    model.save_pretrained(pytorch_dump_folder_path)
+    # determine those parameters from a model checkpoint as Whisper repo does
+    is_multilingual = model.config.vocab_size >= 51865
+    num_languages = model.config.vocab_size - 51765 - int(is_multilingual)
+
+    model.generation_config = _get_generation_config(
+        is_multilingual,
+        num_languages,
+        openai_version,
+    )
+
+    return model, is_multilingual, num_languages
 
 
 # Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960
@@ -225,7 +289,7 @@ def token_bytes_to_string(b):
 
 
 def convert_tiktoken_to_hf(
-    pytorch_dump_folder_path: str, multilingual: bool = True, num_languages: int = 100, time_precision=0.02
+    multilingual: bool = True, num_languages: int = 100, time_precision=0.02
 ) -> WhisperTokenizer:
     # requires whisper, unless we use the path to the tiktoken file
     tiktoken_tokenizer_path = _TOKENIZERS["multilingual" if multilingual else "english"]
@@ -260,35 +324,27 @@ def convert_tiktoken_to_hf(
 
     hf_tokenizer.add_tokens(start_of_transcript + language_tokens + control_tokens, special_tokens=True)
     hf_tokenizer.add_tokens(timestamp_tokens, special_tokens=False)
-    hf_tokenizer.save_pretrained(pytorch_dump_folder_path)
+    return hf_tokenizer
 
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     # # Required parameters
-    parser.add_argument("--checkpoint_path", type=str, help="Patht to the downloaded checkpoints")
+    parser.add_argument("--checkpoint_path", type=str, help="Path to the downloaded checkpoints")
     parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
     parser.add_argument(
-        "--convert_tokenizer",
+        "--convert_preprocessor",
         type=bool,
         default=False,
-        help="Whether or not the tokenizer should be converted along with the model.",
-    )
-    parser.add_argument(
-        "--whisper_version",
-        type=int,
-        default=2,
-        help="Version of the whisper release",
-    )
-    parser.add_argument(
-        "--multilingual",
-        type=bool,
-        default="store_true",
-        help="Whether or not the model is multilingual or english only",
+        help="Whether or not the preprocessor (tokenizer + feature extractor) should be converted along with the model.",
     )
     args = parser.parse_args()
 
-    if args.convert_tokenizer:
+    model, is_multilingual, num_languages = convert_openai_whisper_to_tfms(
+        args.checkpoint_path, args.pytorch_dump_folder_path
+    )
+
+    if args.convert_preprocessor:
         try:
             if not _is_package_available("tiktoken"):
                 raise """`tiktoken` is not installed, use `pip install tiktoken` to convert the tokenizer"""
@@ -297,9 +353,16 @@ def convert_tiktoken_to_hf(
         else:
             from tiktoken.load import load_tiktoken_bpe
 
-            NUM_LANGUAGES_PER_RELEASE = {1: 99, 2: 99, 3: 100}
-            convert_tiktoken_to_hf(
-                args.pytorch_dump_folder_path, args.multilingual, NUM_LANGUAGES_PER_RELEASE[args.whisper_version]
+            tokenizer = convert_tiktoken_to_hf(is_multilingual, num_languages)
+            feature_extractor = WhisperFeatureExtractor(
+                feature_size=model.config.num_mel_bins,
+                # the rest of default parameters are the same as hardcoded in openai/whisper
             )
+            processor = WhisperProcessor(tokenizer=tokenizer, feature_extractor=feature_extractor)
+            processor.save_pretrained(args.pytorch_dump_folder_path)
+
+            # save fast tokenizer as well
+            fast_tokenizer = WhisperTokenizerFast.from_pretrained(args.pytorch_dump_folder_path)
+            fast_tokenizer.save_pretrained(args.pytorch_dump_folder_path, legacy_format=False)
 
-    convert_openai_whisper_to_tfms(args.checkpoint_path, args.pytorch_dump_folder_path)
+    model.save_pretrained(args.pytorch_dump_folder_path)
diff --git a/src/transformers/models/whisper/modeling_tf_whisper.py b/src/transformers/models/whisper/modeling_tf_whisper.py
index 6789758076142b..20c5bf73513b07 100644
--- a/src/transformers/models/whisper/modeling_tf_whisper.py
+++ b/src/transformers/models/whisper/modeling_tf_whisper.py
@@ -1495,6 +1495,11 @@ def generate(
                         f"Unsupported language: {generation_config.language}. Language should be one of:"
                         f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
                     )
+                if language_token not in generation_config.lang_to_id:
+                    raise ValueError(
+                        f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`."
+                        "(You should just add it to the generation config)"
+                    )
                 forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
             else:
                 forced_decoder_ids.append((1, None))  # automatically detect the language
diff --git a/src/transformers/models/whisper/modeling_whisper.py b/src/transformers/models/whisper/modeling_whisper.py
index ad54d51b73f3e6..e7bcb47acdf082 100644
--- a/src/transformers/models/whisper/modeling_whisper.py
+++ b/src/transformers/models/whisper/modeling_whisper.py
@@ -14,7 +14,9 @@
 # limitations under the License.
 """ PyTorch Whisper model."""
 
+import copy
 import math
+import warnings
 from typing import Optional, Tuple, Union
 
 import numpy as np
@@ -26,7 +28,7 @@
 
 from ...activations import ACT2FN
 from ...generation.logits_process import WhisperTimeStampLogitsProcessor
-from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa
 from ...modeling_outputs import (
     BaseModelOutput,
     BaseModelOutputWithPastAndCrossAttentions,
@@ -40,6 +42,7 @@
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
     is_flash_attn_2_available,
+    is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
 )
@@ -477,6 +480,15 @@ class WhisperFlashAttention2(WhisperAttention):
     flash attention and deal with padding tokens in case the input contains any of them.
     """
 
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
     def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
         return tensor.view(bsz, seq_len, self.num_heads, self.head_dim)
 
@@ -601,6 +613,12 @@ def _flash_attention_forward(
             softmax_scale (`float`, *optional*):
                 The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim)
         """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+            causal = self.is_causal and query_length != 1
+
         # Contains at least one padding token in the sequence
         if attention_mask is not None:
             batch_size = query_states.shape[0]
@@ -621,13 +639,13 @@ def _flash_attention_forward(
                 max_seqlen_k=max_seqlen_in_batch_k,
                 dropout_p=dropout,
                 softmax_scale=softmax_scale,
-                causal=self.is_causal,
+                causal=causal,
             )
 
             attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
         else:
             attn_output = flash_attn_func(
-                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=self.is_causal
+                query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
             )
 
         return attn_output
@@ -672,9 +690,113 @@ def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query
         )
 
 
+class WhisperSdpaAttention(WhisperAttention):
+    # Copied from transformers.models.bart.modeling_bart.BartSdpaAttention.forward with BART->whisper, Bart->Whisper
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+        if output_attentions or layer_head_mask is not None:
+            # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented.
+            logger.warning_once(
+                "WhisperModel is using WhisperSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention"
+                ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
+            )
+            return super().forward(
+                hidden_states,
+                key_value_states=key_value_states,
+                past_key_value=past_key_value,
+                attention_mask=attention_mask,
+                layer_head_mask=layer_head_mask,
+                output_attentions=output_attentions,
+            )
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        # get key, value proj
+        # `past_key_value[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_value` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_value is not None
+            and past_key_value[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        query_states = self._shape(query_states, tgt_len, bsz)
+
+        # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask,
+        # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=attention_mask,
+            dropout_p=self.dropout if self.training else 0.0,
+            # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1.
+            is_causal=self.is_causal and attention_mask is None and tgt_len > 1,
+        )
+
+        if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+
 WHISPER_ATTENTION_CLASSES = {
-    "default": WhisperAttention,
+    "eager": WhisperAttention,
     "flash_attention_2": WhisperFlashAttention2,
+    "sdpa": WhisperSdpaAttention,
 }
 
 
@@ -683,9 +805,8 @@ class WhisperEncoderLayer(nn.Module):
     def __init__(self, config: WhisperConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = WHISPER_ATTENTION_CLASSES[attn_type](
+        self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.encoder_attention_heads,
             dropout=config.attention_dropout,
@@ -755,9 +876,8 @@ class WhisperDecoderLayer(nn.Module):
     def __init__(self, config: WhisperConfig):
         super().__init__()
         self.embed_dim = config.d_model
-        attn_type = "flash_attention_2" if getattr(config, "_flash_attn_2_enabled", False) else "default"
 
-        self.self_attn = WHISPER_ATTENTION_CLASSES[attn_type](
+        self.self_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
             embed_dim=self.embed_dim,
             num_heads=config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -770,7 +890,7 @@ def __init__(self, config: WhisperConfig):
         self.activation_dropout = config.activation_dropout
 
         self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
-        self.encoder_attn = WHISPER_ATTENTION_CLASSES[attn_type](
+        self.encoder_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
             self.embed_dim,
             config.decoder_attention_heads,
             dropout=config.attention_dropout,
@@ -879,6 +999,7 @@ class WhisperPreTrainedModel(PreTrainedModel):
     supports_gradient_checkpointing = True
     _no_split_modules = ["WhisperEncoderLayer", "WhisperDecoderLayer"]
     _supports_flash_attn_2 = True
+    _supports_sdpa = True
 
     def _init_weights(self, module):
         std = self.config.init_std
@@ -1111,6 +1232,13 @@ def forward(
             return_dict (`bool`, *optional*):
                 Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
         """
+
+        expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
+        if input_features.shape[-1] != expected_seq_length:
+            raise ValueError(
+                f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
+            )
+
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1202,6 +1330,8 @@ def __init__(self, config: WhisperConfig):
         self.embed_positions = WhisperPositionalEmbedding(self.max_target_positions, config.d_model)
 
         self.layers = nn.ModuleList([WhisperDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
+        self._use_sdpa = config._attn_implementation == "sdpa"
 
         self.layer_norm = nn.LayerNorm(config.d_model)
 
@@ -1311,9 +1441,14 @@ def forward(
         if inputs_embeds is None:
             inputs_embeds = self.embed_tokens(input_ids)
 
-        if getattr(self.config, "_flash_attn_2_enabled", False):
+        if self._use_flash_attention_2:
             # 2d mask is passed through the layers
             attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
+        elif self._use_sdpa and head_mask is None and not output_attentions:
+            # output_attentions=True & head_mask can not be supported when using SDPA.
+            attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
+                attention_mask, input_shape, inputs_embeds, past_key_values_length
+            )
         else:
             # 4d mask is passed through the layers
             attention_mask = _prepare_4d_causal_attention_mask(
@@ -1723,7 +1858,7 @@ def forward(
 
     def generate(
         self,
-        inputs: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
         generation_config=None,
         logits_processor=None,
         stopping_criteria=None,
@@ -1734,12 +1869,16 @@ def generate(
         language=None,
         is_multilingual=None,
         prompt_ids: Optional[torch.Tensor] = None,
-        return_token_timestamps=None,
+        num_segment_frames: Optional[int] = None,
+        return_token_timestamps: Optional[bool] = None,
+        return_segments: bool = False,
+        attention_mask: Optional[torch.Tensor] = None,
+        time_precision: int = 0.02,
+        return_dict_in_generate: Optional[bool] = None,
         **kwargs,
     ):
         """
-
-        Generates sequences of token ids for models with a language modeling head.
+        Transcribes or translates passed mel input features to a sequence of token ids.
 
         <Tip warning={true}>
 
@@ -1801,46 +1940,162 @@ def generate(
                 Whether to return token-level timestamps with the text. This can be used with or without the
                 `return_timestamps` option. To get word-level timestamps, use the tokenizer to group the tokens into
                 words.
+            return_segments (`bool`, *optional*, defaults to `False`):
+                Whether to additionally return a list of all segments. Note that this option can only be enabled
+                when doing long-form transcription.
+            attention_mask (`torch.Tensor`, *optional*):
+                `attention_mask` needs to be passed when doing long-form transcription using a batch size > 1.
+            time_precision (`int`, *optional*, defaults to 0.02):
+                The duration of output token in seconds. *E.g.* 0.02 means that a generated token on average accounts
+                for 20 ms.
+            return_dict_in_generate (`bool`, *optional*, defaults to `False`):
+                Whether or not to return a [`~utils.ModelOutput`] instead of just returning the generated tokens.
+                Note that when doing long-form transcription, `return_dict_in_generate` can only be enabled when
+                `return_segments` is set True. In this case the generation outputs of each segment is added to each
+                segment.
             kwargs (`Dict[str, Any]`, *optional*):
                 Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                 forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                 specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
 
         Return:
-            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
-            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
-
-                If the model is *not* an encoder-decoder model (`model.config.is_encoder_decoder=False`), the possible
-                [`~utils.ModelOutput`] types are:
+            [`~utils.ModelOutput`] or `torch.LongTensor` or `Dict[str, Any]`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor` or a dict of segments when `return_segments=True`.
 
-                    - [`~generation.GreedySearchDecoderOnlyOutput`],
-                    - [`~generation.SampleDecoderOnlyOutput`],
-                    - [`~generation.BeamSearchDecoderOnlyOutput`],
-                    - [`~generation.BeamSampleDecoderOnlyOutput`]
+                If the passed input is > 30 seconds / > 3000 mel input features and `return_segments=True` then a dictionary of generated sequence ids, called `sequences` and a list of each generated segment is returned.
 
-                If the model is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
-                [`~utils.ModelOutput`] types are:
+                else if the passed input is <= 30 seconds / >= 3000 mel input features, the possible [`~utils.ModelOutput`] types are:
 
                     - [`~generation.GreedySearchEncoderDecoderOutput`],
                     - [`~generation.SampleEncoderDecoderOutput`],
                     - [`~generation.BeamSearchEncoderDecoderOutput`],
                     - [`~generation.BeamSampleEncoderDecoderOutput`]
+
+                else only the generated output sequence ids are returned.
+
+        Example:
+
+        - *Longform transcription*: To transcribe or translate audios longer than 30 seconds, process the audio files without truncation and pass all mel features at once to generate.
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+        >>> from datasets import load_dataset, Audio
+
+        >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        >>> model.cuda()
+
+        >>> # load audios > 30 seconds
+        >>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
+        >>> # resample to 16kHz
+        >>> ds = ds.cast_column("audio", Audio(sampling_rate=16000))
+        >>> # take first 8 audios and retrieve array
+        >>> audio = ds[:8]["audio"]
+        >>> audio = [x["array"] for x in audio]
+
+        >>> # make sure to NOT truncate the input audio, to return the `attention_mask` and to pad to the longest audio
+        >>> inputs = processor(audio, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True, sampling_rate=16_000)
+        >>> inputs = inputs.to("cuda", torch.float32)
+
+        >>> # transcribe audio to ids
+        >>> generated_ids = model.generate(**inputs)
+
+        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)
+        >>> transcription[0]
+        ' Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile!'
+        ```
+
+        - *Shortform transcription*: If passed mel input features are < 30 seconds, the whole audio will be transcribed with a single call to generate.
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
+        >>> from datasets import load_dataset
+
+        >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+        >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
+        >>> input_features = inputs.input_features
+
+        >>> generated_ids = model.generate(inputs=input_features)
+
+        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        >>> transcription
+        ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
+        ```
+
         """
+
+        if "inputs" in kwargs:
+            input_features = kwargs.pop("inputs")
+            warnings.warn(
+                "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
+                FutureWarning,
+            )
+
+        return_dict_in_generate = (
+            return_dict_in_generate
+            if return_dict_in_generate is not None
+            else self.generation_config.return_dict_in_generate
+        )
+
         if generation_config is None:
-            generation_config = self.generation_config
+            generation_config = copy.deepcopy(self.generation_config)
+
+        input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
+        if num_segment_frames is None:
+            num_segment_frames = input_stride * self.config.max_source_positions
+
+        # 1. Check whether we're in shortform or longform mode
+        if input_features is not None:
+            total_input_frames = input_features.shape[-1]
+        elif "encoder_outputs" in kwargs:
+            encoder_outputs_shape = (
+                kwargs["encoder_outputs"][0].shape
+                if isinstance(kwargs["encoder_outputs"], BaseModelOutput)
+                else kwargs["encoder_outputs"].shape
+            )
+            total_input_frames = encoder_outputs_shape[1] * input_stride
+        else:
+            raise ValueError("Make sure to provide either `input_features` or `encoder_outputs` to `generate`.")
+
+        is_shortform = total_input_frames <= num_segment_frames
 
-        if return_timestamps is not None:
+        # 2. Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
+        if return_timestamps is True:
             if not hasattr(generation_config, "no_timestamps_token_id"):
                 raise ValueError(
                     "You are trying to return timestamps, but the generation config is not properly set. "
                     "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. "
                     "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363"
                 )
-
             generation_config.return_timestamps = return_timestamps
+        elif not is_shortform:
+            if return_timestamps is False:
+                raise ValueError(
+                    "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which "
+                    "requires the model to predict timestamp tokens. Please either pass `return_timestamps=True` or make sure to pass no more than 3000 mel input features."
+                )
+
+            if not hasattr(generation_config, "no_timestamps_token_id"):
+                raise ValueError(
+                    "You have passed more than 3000 mel input features (> 30 seconds) which automatically enables long-form generation which "
+                    "requires the generation config to have `no_timestamps_token_id` correctly. "
+                    "Make sure to initialize the generation config with the correct attributes that are needed such as `no_timestamps_token_id`. "
+                    "For more details on how to generate the approtiate config, refer to https://github.com/huggingface/transformers/issues/21878#issuecomment-1451902363"
+                    "or make sure to pass no more than 3000 mel input features."
+                )
+
+            logger.info("Setting `return_timestamps=True` for long-form generation.")
+            generation_config.return_timestamps = True
         else:
             generation_config.return_timestamps = False
 
+        # 3. Make sure to correctly set language-related parameters
         if is_multilingual is not None:
             if not hasattr(generation_config, "is_multilingual"):
                 raise ValueError(
@@ -1875,8 +2130,8 @@ def generate(
                 )
             generation_config.task = task
 
+        # 4. Add forced decoder ids depending on passed `language`, `task`,`prompt_ids`, `return_token_timestamps` and `return_timestamps`
         forced_decoder_ids = None
-
         # Legacy code for backward compatibility
         if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids is not None:
             forced_decoder_ids = self.config.forced_decoder_ids
@@ -1903,6 +2158,11 @@ def generate(
                         f"Unsupported language: {generation_config.language}. Language should be one of:"
                         f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
                     )
+                if language_token not in generation_config.lang_to_id:
+                    raise ValueError(
+                        f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`."
+                        "(You should just add it to the generation config)"
+                    )
                 forced_decoder_ids.append((1, generation_config.lang_to_id[language_token]))
             else:
                 forced_decoder_ids.append((1, None))  # automatically detect the language
@@ -1961,12 +2221,9 @@ def generate(
             forced_decoder_ids = [(rank + 1, token) for rank, token in enumerate(forced_decoder_ids)]
             generation_config.forced_decoder_ids = forced_decoder_ids
 
-        if generation_config.return_timestamps:
-            logits_processor = [WhisperTimeStampLogitsProcessor(generation_config)]
-
         if return_token_timestamps:
             kwargs["output_attentions"] = True
-            kwargs["return_dict_in_generate"] = True
+            return_dict_in_generate = True
 
             if getattr(generation_config, "task", None) == "translate":
                 logger.warning("Token-level timestamps may not be reliable for task 'translate'.")
@@ -1979,23 +2236,267 @@ def generate(
             if kwargs.get("num_frames") is not None:
                 generation_config.num_frames = kwargs.pop("num_frames")
 
-        outputs = super().generate(
-            inputs,
-            generation_config,
-            logits_processor,
-            stopping_criteria,
-            prefix_allowed_tokens_fn,
-            synced_gpus,
-            **kwargs,
-        )
+        if generation_config.return_timestamps is True:
+            last_forced_decoder_ids = (
+                generation_config.forced_decoder_ids[-1][-1]
+                if hasattr(self.config, "forced_decoder_ids") and self.config.forced_decoder_ids
+                else None
+            )
+            if last_forced_decoder_ids == self.generation_config.no_timestamps_token_id:
+                # remove no_timestamp to be forcefully generated if we want to return timestamps
+                # this is also important to make sure `WhisperTimeStampLogitsProcessor` functions correctly
+                forced_decoder_ids = generation_config.forced_decoder_ids[:-1]
+                # Make sure that if list is empty we set it to None
+                generation_config.forced_decoder_ids = None if len(forced_decoder_ids) == 0 else forced_decoder_ids
+
+            timestamp_processor = [WhisperTimeStampLogitsProcessor(generation_config)]
+            logits_processor = (
+                timestamp_processor if logits_processor is None else timestamp_processor + logits_processor
+            )
 
-        if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
-            num_frames = getattr(generation_config, "num_frames", None)
-            outputs["token_timestamps"] = self._extract_token_timestamps(
-                outputs, generation_config.alignment_heads, num_frames=num_frames
+        # 5. If we're in shortform mode, simple generate the whole input at once and return the output
+        if is_shortform:
+            outputs = super().generate(
+                input_features,
+                generation_config,
+                logits_processor,
+                stopping_criteria,
+                prefix_allowed_tokens_fn,
+                synced_gpus,
+                return_dict_in_generate=return_dict_in_generate,
+                **kwargs,
             )
 
-        return outputs
+            if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
+                num_frames = getattr(generation_config, "num_frames", None)
+                outputs["token_timestamps"] = self._extract_token_timestamps(
+                    outputs, generation_config.alignment_heads, num_frames=num_frames
+                )
+
+            return outputs
+
+        # 6. Else we're in longform mode which is more complex. We need to chunk the audio input depending on when the model generated
+        # timestamp tokens
+        # 6.1 Set running parameters for while loop
+        if not return_segments and return_dict_in_generate:
+            raise ValueError(
+                "Make sure to set `return_segments=True` to return generation outputs as part of the `'segments' key.`"
+            )
+
+        # if input is longer than 30 seconds we default to long-form generation
+        timestamp_begin = self.generation_config.no_timestamps_token_id + 1
+        # input stride is mel frames per encoder output vector which is the product of all conv strides
+        batch_size = input_features.shape[0]
+
+        if batch_size > 1 and attention_mask is None:
+            raise ValueError(
+                "When doing long-form audio transcription, make sure to pass an `attention_mask`. You can retrieve the `attention_mask` by doing `processor(audio, ..., return_attention_mask=True)` "
+            )
+        elif batch_size > 1:
+            max_frames = attention_mask.sum(-1).cpu().to(torch.long)
+            seek = torch.zeros((batch_size,), dtype=torch.long)
+        else:
+            max_frames = torch.ones((1,), dtype=torch.long) * total_input_frames
+            seek = torch.zeros((1,), dtype=torch.long)
+
+        current_segments = [[] for _ in range(batch_size)]
+        cur_to_prev_index_map = list(range(batch_size))
+
+        # batch size can decrease during the run
+        cur_bsz = prev_bsz = batch_size
+
+        # 6.2 Transcribe audio until we reach the end of all input audios
+        while (seek < max_frames).any():
+            prev_bsz = cur_bsz
+
+            # 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
+            # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
+            # to know which original audio is being decoded
+            new_cur_to_prev_index_map = []
+            for i in range(prev_bsz):
+                prev_i = cur_to_prev_index_map[i]
+                if seek[prev_i] >= max_frames[prev_i]:
+                    cut_index = i + (cur_bsz - prev_bsz)
+                    cur_bsz -= 1
+                    input_features = torch.cat([input_features[:cut_index], input_features[cut_index + 1 :]], dim=0)
+                else:
+                    # cut out index that goes away
+                    new_cur_to_prev_index_map.append(prev_i)
+
+            # 6.4  Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
+            cur_to_prev_index_map = new_cur_to_prev_index_map
+            time_offset = seek * time_precision / input_stride
+            seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
+
+            # 6.5 Make sure that all inputs are padded to the same input length
+            segment_input = []
+            for i in range(cur_bsz):
+                prev_i = cur_to_prev_index_map[i]
+                segment_input_slice = input_features[
+                    i : i + 1, :, seek[prev_i] : seek[prev_i] + seek_num_frames[prev_i]
+                ]
+
+                if segment_input_slice.shape[-1] < num_segment_frames:
+                    # pad to 3000 if necessary
+                    segment_input_slice = F.pad(
+                        segment_input_slice, pad=(0, num_segment_frames - segment_input_slice.shape[-1])
+                    )
+
+                segment_input.append(segment_input_slice)
+
+            segment_input = torch.cat(segment_input, dim=0)
+
+            # 6.6 Batch generate current chunk
+            seek_outputs = super().generate(
+                segment_input,
+                generation_config,
+                logits_processor,
+                stopping_criteria,
+                prefix_allowed_tokens_fn,
+                synced_gpus,
+                return_dict_in_generate=return_dict_in_generate,
+                **kwargs,
+            )
+
+            if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
+                num_frames = getattr(generation_config, "num_frames", None)
+                seek_outputs["token_timestamps"] = self._extract_token_timestamps(
+                    seek_outputs, generation_config.alignment_heads, num_frames=num_frames
+                )
+
+            if return_dict_in_generate:
+                seek_sequences = seek_outputs["sequences"]
+                seek_outputs = [
+                    {k: v[i] for k, v in seek_outputs.items()}
+                    for i in range(next(iter(seek_outputs.values())).size(0))
+                ]
+            else:
+                seek_sequences = seek_outputs
+
+            # 6.7 Loop over each decoded audio individually as each decoding can be of a different length
+            for i, seek_sequence in enumerate(seek_sequences):
+                prev_i = cur_to_prev_index_map[i]
+
+                # make sure we cut a predicted EOS token if we are not finished with the generation yet
+                is_not_final = (seek[prev_i] + num_segment_frames) < max_frames[prev_i]
+                if is_not_final and seek_sequence[-1] == self.generation_config.eos_token_id:
+                    seek_sequence = seek_sequence[:-1]
+
+                # remove all padding tokens
+                if seek_sequence[-1] == self.generation_config.pad_token_id:
+                    num_paddings = (seek_sequence == self.generation_config.pad_token_id).sum()
+                    seek_sequence = seek_sequence[:-num_paddings]
+
+                segments, segment_offset = self._retrieve_segment(
+                    seek_sequence=seek_sequence,
+                    seek_outputs=seek_outputs,
+                    time_offset=time_offset,
+                    timestamp_begin=timestamp_begin,
+                    seek_num_frames=seek_num_frames,
+                    cur_bsz=cur_bsz,
+                    time_precision=time_precision,
+                    input_stride=input_stride,
+                    prev_idx=prev_i,
+                    idx=i,
+                )
+
+                current_segments[prev_i] += segments
+                seek[prev_i] += segment_offset
+
+        # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
+        # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
+        sequences = []
+        max_total_length = 0
+        for current_segment_list in current_segments:
+            sequences.append(torch.cat([d["tokens"] for d in current_segment_list], dim=-1))
+            max_total_length = max(max_total_length, len(sequences[-1]))
+
+        for i in range(batch_size):
+            sequences[i] = F.pad(
+                sequences[i], pad=(0, max_total_length - len(sequences[i])), value=self.generation_config.pad_token_id
+            )
+
+        sequences = torch.stack(sequences, dim=0)
+
+        # 8. If we return all segments, the predicted output sequences are put under `"sequences"`.
+        if return_segments:
+            return {"sequences": sequences, "segments": current_segments}
+
+        return sequences
+
+    @staticmethod
+    def _retrieve_segment(
+        seek_sequence,
+        seek_outputs,
+        time_offset,
+        timestamp_begin,
+        seek_num_frames,
+        cur_bsz,
+        time_precision,
+        input_stride,
+        prev_idx,
+        idx,
+    ):
+        # find the predicted "end of segment" predictions of Whisper
+        # "end of segment" predictions occur whenever Whisper predicts a timestamp token
+        timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
+        single_timestamp_ending = timestamp_tokens[-2:].tolist() == cur_bsz * [[False, True]]
+        timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
+
+        # If whisper predicted a "end of segment" via a timestep token, let's go ever each
+        # "end of segment" prediction and slice the decoding into segments accordingly
+        if len(timestamp_segment_indices) > 0:
+            # if the output contains two consecutive timestamp tokens
+            slices = timestamp_segment_indices.tolist()
+            segments = []
+            if single_timestamp_ending:
+                slices.append(len(seek_sequence))
+
+            last_slice = 0
+            # Add each segment to list of all segments
+            for current_slice in slices:
+                sliced_tokens = seek_sequence[last_slice + 1 : current_slice + 1]
+                start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
+                end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
+                segments.append(
+                    {
+                        "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
+                        "end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
+                        "tokens": sliced_tokens,
+                        "result": seek_outputs[idx],
+                    }
+                )
+                last_slice = current_slice
+
+            if single_timestamp_ending:
+                # single timestamp at the end means no speech after the last timestamp.
+                segment_offset = seek_num_frames[prev_idx]
+            else:
+                # otherwise, ignore the unfinished segment and seek to the last timestamp
+                # here we throw away all predictions after the last predicted "end of segment"
+                # since we are cutting right in the middle of an audio
+                last_timestamp_pos = seek_sequence[last_slice].item() - timestamp_begin
+                segment_offset = last_timestamp_pos * input_stride
+        else:
+            # If whisper does not predict any "end of segment" token, then
+            # the whole decoding is considered a segment and we add it to the list of segments
+            timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
+            last_timestamp_pos = seek_num_frames[prev_idx]
+            if timestamps.numel() > 0 and timestamps[-1].item() != timestamp_begin:
+                # no consecutive timestamps but it has a timestamp; use the last one.
+                last_timestamp_pos = timestamps[-1].item() - timestamp_begin
+
+            segments = [
+                {
+                    "start": time_offset[prev_idx],
+                    "end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
+                    "tokens": seek_sequence,
+                    "result": seek_outputs[idx],
+                }
+            ]
+            segment_offset = seek_num_frames[prev_idx]
+
+        return segments, segment_offset
 
     def prepare_inputs_for_generation(
         self,
@@ -2229,7 +2730,7 @@ def forward(
         >>> predicted_ids = model.generate(input_features, assistant_model=assistant_model)
 
         >>> # decode token ids to text
-        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
         >>> transcription
         ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.'
         ```"""
diff --git a/src/transformers/models/xglm/modeling_tf_xglm.py b/src/transformers/models/xglm/modeling_tf_xglm.py
index e2890edeb665af..05f87eb5d31c50 100644
--- a/src/transformers/models/xglm/modeling_tf_xglm.py
+++ b/src/transformers/models/xglm/modeling_tf_xglm.py
@@ -924,3 +924,9 @@ def call(
             attentions=outputs.attentions,
             cross_attentions=outputs.cross_attentions,
         )
+
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if tf_weight == "lm_head.weight":
+            return tf_weight, "model.embed_tokens.weight"
+        else:
+            return (tf_weight,)
diff --git a/src/transformers/models/yolos/image_processing_yolos.py b/src/transformers/models/yolos/image_processing_yolos.py
index 937d3c4298772d..4b59fd5ef04905 100644
--- a/src/transformers/models/yolos/image_processing_yolos.py
+++ b/src/transformers/models/yolos/image_processing_yolos.py
@@ -35,6 +35,9 @@
 from ...image_utils import (
     IMAGENET_DEFAULT_MEAN,
     IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    AnnotionFormat,  # noqa: F401
     ChannelDimension,
     ImageInput,
     PILImageResampling,
@@ -43,12 +46,10 @@
     is_scaled_image,
     make_list_of_images,
     to_numpy_array,
-    valid_coco_detection_annotations,
-    valid_coco_panoptic_annotations,
     valid_images,
+    validate_annotations,
 )
 from ...utils import (
-    ExplicitEnum,
     TensorType,
     is_flax_available,
     is_jax_tensor,
@@ -77,15 +78,7 @@
 
 logger = logging.get_logger(__name__)
 
-AnnotationType = Dict[str, Union[int, str, List[Dict]]]
-
-
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = "coco_detection"
-    COCO_PANOPTIC = "coco_panoptic"
-
-
-SUPPORTED_ANNOTATION_FORMATS = (AnnotionFormat.COCO_DETECTION, AnnotionFormat.COCO_PANOPTIC)
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
 
 
 # Copied from transformers.models.detr.image_processing_detr.get_max_height_width
@@ -329,10 +322,13 @@ def prepare_coco_detection_annotation(
 
     if annotations and "keypoints" in annotations[0]:
         keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
         keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
         num_keypoints = keypoints.shape[0]
         keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
-        new_target["keypoints"] = keypoints[keep]
+        new_target["keypoints"] = keypoints
 
     if return_segmentation_masks:
         segmentation_masks = [obj["segmentation"] for obj in annotations]
@@ -709,7 +705,7 @@ class YolosImageProcessor(BaseImageProcessor):
 
     def __init__(
         self,
-        format: Union[str, AnnotionFormat] = AnnotionFormat.COCO_DETECTION,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
         do_resize: bool = True,
         size: Dict[str, int] = None,
         resample: PILImageResampling = PILImageResampling.BILINEAR,
@@ -768,7 +764,7 @@ def prepare_annotation(
         self,
         image: np.ndarray,
         target: Dict,
-        format: Optional[AnnotionFormat] = None,
+        format: Optional[AnnotationFormat] = None,
         return_segmentation_masks: bool = None,
         masks_path: Optional[Union[str, pathlib.Path]] = None,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -778,12 +774,12 @@ def prepare_annotation(
         """
         format = format if format is not None else self.format
 
-        if format == AnnotionFormat.COCO_DETECTION:
+        if format == AnnotationFormat.COCO_DETECTION:
             return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_detection_annotation(
                 image, target, return_segmentation_masks, input_data_format=input_data_format
             )
-        elif format == AnnotionFormat.COCO_PANOPTIC:
+        elif format == AnnotationFormat.COCO_PANOPTIC:
             return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
             target = prepare_coco_panoptic_annotation(
                 image,
@@ -1023,7 +1019,7 @@ def preprocess(
         image_mean: Optional[Union[float, List[float]]] = None,
         image_std: Optional[Union[float, List[float]]] = None,
         do_pad: Optional[bool] = None,
-        format: Optional[Union[str, AnnotionFormat]] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
         return_tensors: Optional[Union[TensorType, str]] = None,
         data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
         input_data_format: Optional[Union[str, ChannelDimension]] = None,
@@ -1037,12 +1033,12 @@ def preprocess(
                 Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
                 from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
             annotations (`AnnotationType` or `List[AnnotationType]`, *optional*):
-                List of annotations associated with the image or batch of images. If annotionation is for object
+                List of annotations associated with the image or batch of images. If annotation is for object
                 detection, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a
                   dictionary. An image can have no annotations, in which case the list should be empty.
-                If annotionation is for segmentation, the annotations should be a dictionary with the following keys:
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
                 - "image_id" (`int`): The image id.
                 - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary.
                   An image can have no segments, in which case the list should be empty.
@@ -1069,7 +1065,7 @@ def preprocess(
                 Standard deviation to use when normalizing the image.
             do_pad (`bool`, *optional*, defaults to self.do_pad):
                 Whether to pad the image.
-            format (`str` or `AnnotionFormat`, *optional*, defaults to self.format):
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
                 Format of the annotations.
             return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
                 Type of tensors to return. If `None`, will return the list of images.
@@ -1133,28 +1129,13 @@ def preprocess(
                 "torch.Tensor, tf.Tensor or jax.ndarray."
             )
 
-        format = AnnotionFormat(format)
+        format = AnnotationFormat(format)
         if annotations is not None:
-            if format == AnnotionFormat.COCO_DETECTION and not valid_coco_detection_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO detection annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id` and `annotations`, with the latter "
-                    "being a list of annotations in the COCO format."
-                )
-            elif format == AnnotionFormat.COCO_PANOPTIC and not valid_coco_panoptic_annotations(annotations):
-                raise ValueError(
-                    "Invalid COCO panoptic annotations. Annotations must a dict (single image) of list of dicts "
-                    "(batch of images) with the following keys: `image_id`, `file_name` and `segments_info`, with "
-                    "the latter being a list of annotations in the COCO format."
-                )
-            elif format not in SUPPORTED_ANNOTATION_FORMATS:
-                raise ValueError(
-                    f"Unsupported annotation format: {format} must be one of {SUPPORTED_ANNOTATION_FORMATS}"
-                )
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
 
         if (
             masks_path is not None
-            and format == AnnotionFormat.COCO_PANOPTIC
+            and format == AnnotationFormat.COCO_PANOPTIC
             and not isinstance(masks_path, (pathlib.Path, str))
         ):
             raise ValueError(
diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py
index b2f0ca0db532e7..65ffbfced4e85c 100755
--- a/src/transformers/models/yolos/modeling_yolos.py
+++ b/src/transformers/models/yolos/modeling_yolos.py
@@ -756,7 +756,7 @@ def forward(
         >>> inputs = image_processor(images=image, return_tensors="pt")
         >>> outputs = model(**inputs)
 
-        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
         >>> target_sizes = torch.tensor([image.size[::-1]])
         >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
         ...     0
diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py
index 124813b22abbc5..b3861b371a2393 100644
--- a/src/transformers/optimization.py
+++ b/src/transformers/optimization.py
@@ -53,19 +53,22 @@ def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1):
     return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch)
 
 
-def get_reduce_on_plateau_schedule(optimizer: Optimizer):
+def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs):
     """
     Create a schedule with a constant learning rate that decreases when a metric has stopped improving.
 
     Args:
         optimizer ([`~torch.optim.Optimizer`]):
             The optimizer for which to schedule the learning rate.
+        kwargs (`dict`, *optional*):
+            Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau`
+            for possible parameters.
 
     Return:
         `torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule.
     """
 
-    return ReduceLROnPlateau(optimizer)
+    return ReduceLROnPlateau(optimizer, **kwargs)
 
 
 def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int):
@@ -359,9 +362,15 @@ def get_scheduler(
     """
     name = SchedulerType(name)
     schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name]
-    if name == SchedulerType.CONSTANT or name == SchedulerType.REDUCE_ON_PLATEAU:
+    if name == SchedulerType.CONSTANT:
         return schedule_func(optimizer)
 
+    if scheduler_specific_kwargs is None:
+        scheduler_specific_kwargs = {}
+
+    if name == SchedulerType.REDUCE_ON_PLATEAU:
+        return schedule_func(optimizer, **scheduler_specific_kwargs)
+
     # All other schedulers require `num_warmup_steps`
     if num_warmup_steps is None:
         raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.")
@@ -376,9 +385,6 @@ def get_scheduler(
     if num_training_steps is None:
         raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.")
 
-    if scheduler_specific_kwargs is None:
-        scheduler_specific_kwargs = {}
-
     return schedule_func(
         optimizer,
         num_warmup_steps=num_warmup_steps,
diff --git a/src/transformers/pipelines/automatic_speech_recognition.py b/src/transformers/pipelines/automatic_speech_recognition.py
index b464bfb4092558..8fd1701d3ca52a 100644
--- a/src/transformers/pipelines/automatic_speech_recognition.py
+++ b/src/transformers/pipelines/automatic_speech_recognition.py
@@ -508,9 +508,19 @@ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
             ):
                 yield item
         else:
-            processed = self.feature_extractor(
-                inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
-            )
+            if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
+                processed = self.feature_extractor(
+                    inputs,
+                    sampling_rate=self.feature_extractor.sampling_rate,
+                    truncation=False,
+                    padding="longest",
+                    return_tensors="pt",
+                )
+            else:
+                processed = self.feature_extractor(
+                    inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+                )
+
             if self.torch_dtype is not None:
                 processed = processed.to(dtype=self.torch_dtype)
             if stride is not None:
@@ -551,8 +561,12 @@ def _forward(self, model_inputs, return_timestamps=False, generate_kwargs=None):
                     if stride is not None:
                         generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
 
+            if self.type == "seq2seq_whisper" and inputs.shape[-1] > self.feature_extractor.nb_max_frames:
+                generate_kwargs["input_features"] = inputs
+            else:
+                generate_kwargs["encoder_outputs"] = encoder(inputs, attention_mask=attention_mask)
+
             tokens = self.model.generate(
-                encoder_outputs=encoder(inputs, attention_mask=attention_mask),
                 attention_mask=attention_mask,
                 **generate_kwargs,
             )
diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index a32a9aa9ad8b48..42c5d927079c38 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -503,7 +503,7 @@ def group_sub_entities(self, entities: List[dict]) -> dict:
             entities (`dict`): The entities predicted by the pipeline.
         """
         # Get the first entity in the entity group
-        entity = entities[0]["entity"].split("-")[-1]
+        entity = entities[0]["entity"].split("-", 1)[-1]
         scores = np.nanmean([entity["score"] for entity in entities])
         tokens = [entity["word"] for entity in entities]
 
diff --git a/src/transformers/pytorch_utils.py b/src/transformers/pytorch_utils.py
index d0bc55fe8383ff..1a464b62a66513 100644
--- a/src/transformers/pytorch_utils.py
+++ b/src/transformers/pytorch_utils.py
@@ -30,6 +30,7 @@
 
 is_torch_greater_or_equal_than_2_1 = parsed_torch_version_base >= version.parse("2.1")
 is_torch_greater_or_equal_than_2_0 = parsed_torch_version_base >= version.parse("2.0")
+is_torch_greater_or_equal_than_1_13 = parsed_torch_version_base >= version.parse("1.13")
 is_torch_greater_or_equal_than_1_12 = parsed_torch_version_base >= version.parse("1.12")
 is_torch_greater_or_equal_than_1_11 = parsed_torch_version_base >= version.parse("1.11")
 is_torch_less_than_1_11 = parsed_torch_version_base < version.parse("1.11")
diff --git a/src/transformers/safetensors_conversion.py b/src/transformers/safetensors_conversion.py
new file mode 100644
index 00000000000000..46de0e5755fdf0
--- /dev/null
+++ b/src/transformers/safetensors_conversion.py
@@ -0,0 +1,107 @@
+import json
+import uuid
+from typing import Optional
+
+import requests
+from huggingface_hub import Discussion, HfApi, get_repo_discussions
+
+from .utils import cached_file, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def previous_pr(api: HfApi, model_id: str, pr_title: str, token: str) -> Optional["Discussion"]:
+    main_commit = api.list_repo_commits(model_id, token=token)[0].commit_id
+    for discussion in get_repo_discussions(repo_id=model_id, token=token):
+        if discussion.title == pr_title and discussion.status == "open" and discussion.is_pull_request:
+            commits = api.list_repo_commits(model_id, revision=discussion.git_reference, token=token)
+
+            if main_commit == commits[1].commit_id:
+                return discussion
+    return None
+
+
+def spawn_conversion(token: str, private: bool, model_id: str):
+    logger.info("Attempting to convert .bin model on the fly to safetensors.")
+
+    safetensors_convert_space_url = "https://safetensors-convert.hf.space"
+    sse_url = f"{safetensors_convert_space_url}/queue/join"
+    sse_data_url = f"{safetensors_convert_space_url}/queue/data"
+
+    # The `fn_index` is necessary to indicate to gradio that we will use the `run` method of the Space.
+    hash_data = {"fn_index": 1, "session_hash": str(uuid.uuid4())}
+
+    def start(_sse_connection, payload):
+        for line in _sse_connection.iter_lines():
+            line = line.decode()
+            if line.startswith("data:"):
+                resp = json.loads(line[5:])
+                logger.debug(f"Safetensors conversion status: {resp['msg']}")
+                if resp["msg"] == "queue_full":
+                    raise ValueError("Queue is full! Please try again.")
+                elif resp["msg"] == "send_data":
+                    event_id = resp["event_id"]
+                    response = requests.post(
+                        sse_data_url,
+                        stream=True,
+                        params=hash_data,
+                        json={"event_id": event_id, **payload, **hash_data},
+                    )
+                    response.raise_for_status()
+                elif resp["msg"] == "process_completed":
+                    return
+
+    with requests.get(sse_url, stream=True, params=hash_data) as sse_connection:
+        data = {"data": [model_id, private, token]}
+        try:
+            logger.debug("Spawning safetensors automatic conversion.")
+            start(sse_connection, data)
+        except Exception as e:
+            logger.warning(f"Error during conversion: {repr(e)}")
+
+
+def get_conversion_pr_reference(api: HfApi, model_id: str, **kwargs):
+    private = api.model_info(model_id).private
+
+    logger.info("Attempting to create safetensors variant")
+    pr_title = "Adding `safetensors` variant of this model"
+    token = kwargs.get("token")
+
+    # This looks into the current repo's open PRs to see if a PR for safetensors was already open. If so, it
+    # returns it. It checks that the PR was opened by the bot and not by another user so as to prevent
+    # security breaches.
+    pr = previous_pr(api, model_id, pr_title, token=token)
+
+    if pr is None or (not private and pr.author != "SFConvertBot"):
+        spawn_conversion(token, private, model_id)
+        pr = previous_pr(api, model_id, pr_title, token=token)
+    else:
+        logger.info("Safetensors PR exists")
+
+    sha = f"refs/pr/{pr.num}"
+
+    return sha
+
+
+def auto_conversion(pretrained_model_name_or_path: str, **cached_file_kwargs):
+    api = HfApi(token=cached_file_kwargs.get("token"))
+    sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs)
+
+    if sha is None:
+        return None, None
+    cached_file_kwargs["revision"] = sha
+    del cached_file_kwargs["_commit_hash"]
+
+    # This is an additional HEAD call that could be removed if we could infer sharded/non-sharded from the PR
+    # description.
+    sharded = api.file_exists(
+        pretrained_model_name_or_path,
+        "model.safetensors.index.json",
+        revision=sha,
+        token=cached_file_kwargs.get("token"),
+    )
+    filename = "model.safetensors.index.json" if sharded else "model.safetensors"
+
+    resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
+    return resolved_archive_file, sha, sharded
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index eb21cbac2303e6..4dcca595a1dc61 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -107,6 +107,7 @@
     is_torch_fp16_available_on_device,
     is_torch_neuroncore_available,
     is_torch_npu_available,
+    is_torch_sdpa_available,
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
@@ -440,6 +441,15 @@ def require_flash_attn(test_case):
     return unittest.skipUnless(is_flash_attn_2_available(), "test requires Flash Attention")(test_case)
 
 
+def require_torch_sdpa(test_case):
+    """
+    Decorator marking a test that requires PyTorch's SDPA.
+
+    These tests are skipped when requirements are not met (torch version).
+    """
+    return unittest.skipUnless(is_torch_sdpa_available(), "test requires PyTorch SDPA")(test_case)
+
+
 def require_peft(test_case):
     """
     Decorator marking a test that requires PEFT.
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index 723de720a5f939..b9bc0ec54b01fc 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1786,7 +1786,7 @@ def default_chat_template(self):
         """
         logger.warning_once(
             "\nNo chat template is defined for this tokenizer - using a default chat template "
-            "that implements the ChatML format. If the default is not appropriate for "
+            "that implements the ChatML format (without BOS/EOS tokens!). If the default is not appropriate for "
             "your model, please set `tokenizer.chat_template` to an appropriate template. "
             "See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
         )
@@ -2235,7 +2235,7 @@ def _from_pretrained(
 
             # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
             # if `tokenizer_config.json` is `None`
-            if "Fast" not in cls.__name__ and tokenizer_file is not None:
+            if tokenizer_file is not None:
                 # This is for slow so can be done before
                 with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
                     tokenizer_file_handle = json.load(tokenizer_file_handle)
@@ -2247,14 +2247,14 @@ def _from_pretrained(
             # end legacy
 
         # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
+        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
+        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
+        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
         for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
             if added_tokens_map != {} and init_kwargs[key] is not None:
                 if key != "additional_special_tokens":
-                    init_kwargs[key] = added_tokens_map.get(init_kwargs[key], init_kwargs[key])
+                    init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
 
-        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
-        # convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
-        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
         # Instantiate the tokenizer.
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
@@ -2858,13 +2858,13 @@ def _is_valid_text_input(t):
 
         if not _is_valid_text_input(text):
             raise ValueError(
-                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+                "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                 "or `List[List[str]]` (batch of pretokenized examples)."
             )
 
         if text_pair is not None and not _is_valid_text_input(text_pair):
             raise ValueError(
-                "text input must of type `str` (single example), `List[str]` (batch or single pretokenized example) "
+                "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) "
                 "or `List[List[str]]` (batch of pretokenized examples)."
             )
 
diff --git a/src/transformers/tools/agents.py b/src/transformers/tools/agents.py
index 51e3f6db0c25a3..3e423ebb30556d 100644
--- a/src/transformers/tools/agents.py
+++ b/src/transformers/tools/agents.py
@@ -440,13 +440,13 @@ def generate_one(self, prompt, stop):
             return self._completion_generate([prompt], stop)[0]
 
     def _chat_generate(self, prompt, stop):
-        result = openai.ChatCompletion.create(
+        result = openai.chat.completions.create(
             model=self.model,
             messages=[{"role": "user", "content": prompt}],
             temperature=0,
             stop=stop,
         )
-        return result["choices"][0]["message"]["content"]
+        return result.choices[0].message.content
 
     def _completion_generate(self, prompts, stop):
         result = openai.Completion.create(
diff --git a/src/transformers/tools/base.py b/src/transformers/tools/base.py
index 42027948a6f5fa..4042b28ac64c09 100644
--- a/src/transformers/tools/base.py
+++ b/src/transformers/tools/base.py
@@ -601,7 +601,7 @@ def fn(*args, **kwargs):
 def get_default_device():
     logger.warning(
         "`get_default_device` is deprecated and will be replaced with `accelerate`'s `PartialState().default_device` "
-        "in version 4.36 of 🤗 Transformers. "
+        "in version 4.38 of 🤗 Transformers. "
     )
     if not is_torch_available():
         raise ImportError("Please install torch in order to use this tool.")
diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py
index 25100dad69bcaa..d6ccc4334dd46d 100755
--- a/src/transformers/trainer.py
+++ b/src/transformers/trainer.py
@@ -28,6 +28,7 @@
 import re
 import shutil
 import sys
+import tempfile
 import time
 import warnings
 from collections.abc import Mapping
@@ -48,7 +49,7 @@
 import numpy as np
 import torch
 import torch.distributed as dist
-from huggingface_hub import Repository, create_repo, upload_folder
+from huggingface_hub import ModelCard, create_repo, upload_folder
 from packaging import version
 from torch import nn
 from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
@@ -99,7 +100,6 @@
     BestRun,
     EvalLoopOutput,
     EvalPrediction,
-    FSDPOption,
     HPSearchBackend,
     HubStrategy,
     IntervalStrategy,
@@ -193,15 +193,15 @@
 if is_accelerate_available():
     from accelerate import Accelerator, skip_first_batches
     from accelerate import __version__ as accelerate_version
-    from accelerate.utils import DistributedDataParallelKwargs, GradientAccumulationPlugin
-
-    if version.parse(accelerate_version) > version.parse("0.20.3"):
-        from accelerate.utils import (
-            load_fsdp_model,
-            load_fsdp_optimizer,
-            save_fsdp_model,
-            save_fsdp_optimizer,
-        )
+    from accelerate.utils import (
+        DistributedDataParallelKwargs,
+        GradientAccumulationPlugin,
+        load_fsdp_model,
+        load_fsdp_optimizer,
+        save_fsdp_model,
+        save_fsdp_optimizer,
+    )
+
     DATA_SAMPLERS = [RandomSampler]
     if version.parse(accelerate_version) > version.parse("0.23.0"):
         from accelerate.data_loader import SeedableRandomSampler
@@ -226,6 +226,7 @@
 OPTIMIZER_NAME_BIN = "optimizer.bin"
 SCHEDULER_NAME = "scheduler.pt"
 SCALER_NAME = "scaler.pt"
+FSDP_MODEL_NAME = "pytorch_model_fsdp"
 
 
 class Trainer:
@@ -415,7 +416,7 @@ def __init__(
                 " model, please make sure that you have installed `bitsandbytes>=0.37.0`. "
             )
 
-        self.fsdp = None
+        self.is_fsdp_xla_enabled = args.fsdp_config["xla"]
         if len(args.fsdp) > 0:
             if self.is_deepspeed_enabled:
                 raise ValueError(
@@ -424,32 +425,6 @@ def __init__(
             if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED:
                 raise ValueError("Using fsdp only works in distributed training.")
 
-            # dep_version_check("torch>=1.12.0")
-            # Would have to update setup.py with torch>=1.12.0
-            # which isn't ideally given that it will force people not using FSDP to also use torch>=1.12.0
-            # below is the current alternative.
-            if version.parse(version.parse(torch.__version__).base_version) < version.parse("1.12.0"):
-                raise ValueError("FSDP requires PyTorch >= 1.12.0")
-
-            from torch.distributed.fsdp.fully_sharded_data_parallel import BackwardPrefetch, ShardingStrategy
-
-            if FSDPOption.FULL_SHARD in args.fsdp:
-                self.fsdp = ShardingStrategy.FULL_SHARD
-            elif FSDPOption.SHARD_GRAD_OP in args.fsdp:
-                self.fsdp = ShardingStrategy.SHARD_GRAD_OP
-            elif FSDPOption.NO_SHARD in args.fsdp:
-                self.fsdp = ShardingStrategy.NO_SHARD
-
-            self.backward_prefetch = BackwardPrefetch.BACKWARD_PRE
-            if "backward_prefetch" in self.args.fsdp_config and "backward_post" in self.args.fsdp_config.get(
-                "backward_prefetch", []
-            ):
-                self.backward_prefetch = BackwardPrefetch.BACKWARD_POST
-
-            self.limit_all_gathers = False
-            if self.args.fsdp_config.get("limit_all_gathers", False):
-                self.limit_all_gathers = True
-
         # one place to sort out whether to place the model on device or not
         # postpone switching model to cuda when:
         # 1. MP - since we are trying to fit a much bigger than 1 gpu model
@@ -462,7 +437,7 @@ def __init__(
             self.is_model_parallel
             or self.is_deepspeed_enabled
             or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train)
-            or (self.fsdp is not None)
+            or self.is_fsdp_xla_enabled
             or self.is_fsdp_enabled
         ):
             self.place_model_on_device = False
@@ -513,7 +488,7 @@ def __init__(
                     " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and"
                     " `model.to(xm.xla_device())` is performed before the optimizer creation in your script."
                 )
-        if (self.is_deepspeed_enabled or (self.fsdp is not None)) and (
+        if (self.is_deepspeed_enabled or self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and (
             self.optimizer is not None or self.lr_scheduler is not None
         ):
             raise RuntimeError(
@@ -621,7 +596,6 @@ def __init__(
         # returned to 0 every time flos need to be logged
         self.current_flos = 0
         self.hp_search_backend = None
-        self.use_tune_checkpoints = False
         default_label_names = find_labels(self.model.__class__)
         self.label_names = default_label_names if self.args.label_names is None else self.args.label_names
         self.can_return_loss = can_return_loss(self.model.__class__)
@@ -676,38 +650,38 @@ def _deactivate_neftune(self, model):
 
     def add_callback(self, callback):
         """
-        Add a callback to the current list of [`~transformer.TrainerCallback`].
+        Add a callback to the current list of [`~transformers.TrainerCallback`].
 
         Args:
-           callback (`type` or [`~transformer.TrainerCallback`]):
-               A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the
+           callback (`type` or [`~transformers.TrainerCallback`]):
+               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                first case, will instantiate a member of that class.
         """
         self.callback_handler.add_callback(callback)
 
     def pop_callback(self, callback):
         """
-        Remove a callback from the current list of [`~transformer.TrainerCallback`] and returns it.
+        Remove a callback from the current list of [`~transformers.TrainerCallback`] and returns it.
 
         If the callback is not found, returns `None` (and no error is raised).
 
         Args:
-           callback (`type` or [`~transformer.TrainerCallback`]):
-               A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the
+           callback (`type` or [`~transformers.TrainerCallback`]):
+               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                first case, will pop the first member of that class found in the list of callbacks.
 
         Returns:
-            [`~transformer.TrainerCallback`]: The callback removed, if found.
+            [`~transformers.TrainerCallback`]: The callback removed, if found.
         """
         return self.callback_handler.pop_callback(callback)
 
     def remove_callback(self, callback):
         """
-        Remove a callback from the current list of [`~transformer.TrainerCallback`].
+        Remove a callback from the current list of [`~transformers.TrainerCallback`].
 
         Args:
-           callback (`type` or [`~transformer.TrainerCallback`]):
-               A [`~transformer.TrainerCallback`] class or an instance of a [`~transformer.TrainerCallback`]. In the
+           callback (`type` or [`~transformers.TrainerCallback`]):
+               A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the
                first case, will remove the first member of that class found in the list of callbacks.
         """
         self.callback_handler.remove_callback(callback)
@@ -721,7 +695,10 @@ def _move_model_to_device(self, model, device):
     def _set_signature_columns_if_needed(self):
         if self._signature_columns is None:
             # Inspect model forward signature to keep only the arguments it accepts.
-            signature = inspect.signature(self.model.forward)
+            model_to_inspect = self.model
+            if is_peft_available() and isinstance(self.model, PeftModel):
+                model_to_inspect = self.model.get_base_model()
+            signature = inspect.signature(model_to_inspect.forward)
             self._signature_columns = list(signature.parameters.keys())
             # Labels may be named label or label_ids, the default data collator handles that.
             self._signature_columns += list(set(["label", "label_ids"] + self.label_names))
@@ -819,6 +796,7 @@ def get_train_dataloader(self) -> DataLoader:
             "collate_fn": data_collator,
             "num_workers": self.args.dataloader_num_workers,
             "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
         }
 
         if not isinstance(train_dataset, torch.utils.data.IterableDataset):
@@ -876,6 +854,7 @@ def get_eval_dataloader(self, eval_dataset: Optional[Dataset] = None) -> DataLoa
             "collate_fn": data_collator,
             "num_workers": self.args.dataloader_num_workers,
             "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
         }
 
         if not isinstance(eval_dataset, torch.utils.data.IterableDataset):
@@ -907,6 +886,7 @@ def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
             "collate_fn": data_collator,
             "num_workers": self.args.dataloader_num_workers,
             "pin_memory": self.args.dataloader_pin_memory,
+            "persistent_workers": self.args.dataloader_persistent_workers,
         }
 
         if not isinstance(test_dataset, torch.utils.data.IterableDataset):
@@ -1137,7 +1117,7 @@ def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optim
                 optimizer=self.optimizer if optimizer is None else optimizer,
                 num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                 num_training_steps=num_training_steps,
-                **self.args.lr_scheduler_kwargs,
+                scheduler_specific_kwargs=self.args.lr_scheduler_kwargs,
             )
             self._created_lr_scheduler = True
         return self.lr_scheduler
@@ -1224,7 +1204,8 @@ def _hp_search_setup(self, trial: Union["optuna.Trial", Dict[str, Any]]):
     def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, Any]], step: int, metrics: Dict[str, float]):
         if self.hp_search_backend is None or trial is None:
             return
-        self.objective = self.compute_objective(metrics.copy())
+        metrics = metrics.copy()
+        self.objective = self.compute_objective(metrics)
         if self.hp_search_backend == HPSearchBackend.OPTUNA:
             import optuna
 
@@ -1234,24 +1215,23 @@ def _report_to_hp_search(self, trial: Union["optuna.Trial", Dict[str, Any]], ste
                     self.callback_handler.on_train_end(self.args, self.state, self.control)
                     raise optuna.TrialPruned()
         elif self.hp_search_backend == HPSearchBackend.RAY:
-            from ray import tune
-
-            if self.control.should_save:
-                self._tune_save_checkpoint()
-            tune.report(objective=self.objective, **metrics)
-
-    def _tune_save_checkpoint(self):
-        from ray import tune
-
-        if not self.use_tune_checkpoints:
-            return
-        with tune.checkpoint_dir(step=self.state.global_step) as checkpoint_dir:
-            output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
-            self.save_model(output_dir, _internal_call=True)
-            if self.args.should_save:
-                self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
-                torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
-                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+            import ray.train
+
+            with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
+                checkpoint = None
+                if self.control.should_save:
+                    self._tune_save_checkpoint(checkpoint_dir=temp_checkpoint_dir)
+                    checkpoint = ray.train.Checkpoint.from_directory(temp_checkpoint_dir)
+                metrics["objective"] = self.objective
+                ray.train.report(metrics, checkpoint=checkpoint)
+
+    def _tune_save_checkpoint(self, checkpoint_dir: str):
+        output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}")
+        self.save_model(output_dir, _internal_call=True)
+        if self.args.should_save:
+            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
+            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+            torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
 
     def call_model_init(self, trial=None):
         model_init_argcount = number_of_arguments(self.model_init)
@@ -1367,7 +1347,7 @@ def _wrap_model(self, model, training=True, dataloader=None):
 
         # Distributed training (should be after apex fp16 initialization)
         # Distributed training using PyTorch FSDP
-        if self.fsdp is not None and self.args.fsdp_config["xla"]:
+        if self.is_fsdp_xla_enabled:
             try:
                 from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP
                 from torch_xla.distributed.fsdp import checkpoint_module
@@ -1530,6 +1510,10 @@ def train(
             and not self.is_fsdp_enabled
         ):
             self._load_from_checkpoint(resume_from_checkpoint)
+            # In case of repeating the find_executable_batch_size, set `self._train_batch_size` properly
+            state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME))
+            if state.train_batch_size is not None:
+                self._train_batch_size = state.train_batch_size
 
         # If model was re-initialized, put it on the right device and update self.model_wrapped
         if model_reloaded:
@@ -1565,6 +1549,8 @@ def _inner_training_loop(
     ):
         self.accelerator.free_memory()
         self._train_batch_size = batch_size
+        if self.args.auto_find_batch_size:
+            self.state.train_batch_size = self._train_batch_size
         logger.debug(f"Currently training with a batch size of: {self._train_batch_size}")
         # Data loader and number of training steps
         train_dataloader = self.get_train_dataloader()
@@ -1621,12 +1607,12 @@ def _inner_training_loop(
                 # references registered here no longer work on other gpus, breaking the module
                 raise ValueError(
                     "Currently --debug underflow_overflow is not supported under DP. Please use DDP"
-                    " (torch.distributed.launch)."
+                    " (torchrun or torch.distributed.launch (deprecated))."
                 )
             else:
                 debug_overflow = DebugUnderflowOverflow(self.model)  # noqa
 
-        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.fsdp is not None or self.is_fsdp_enabled
+        delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled
 
         # We need to reset the scheduler, as its parameters may be different on subsequent calls
         if self._created_lr_scheduler:
@@ -1641,6 +1627,7 @@ def _inner_training_loop(
 
         self.state = TrainerState()
         self.state.is_hyper_param_search = trial is not None
+        self.state.train_batch_size = self._train_batch_size
 
         # Compute absolute values for logging, eval, and save if given as ratio
         if args.logging_steps is not None:
@@ -1676,8 +1663,6 @@ def _inner_training_loop(
         use_accelerator_prepare = True if model is self.model else False
 
         if delay_optimizer_creation:
-            if use_accelerator_prepare:
-                self.model = self.accelerator.prepare(self.model)
             self.create_optimizer_and_scheduler(num_training_steps=max_steps)
 
         # prepare using `accelerator` prepare
@@ -1895,9 +1880,7 @@ def _inner_training_loop(
                 ):
                     # the `or` condition of `is_last_step_and_steps_less_than_grad_acc` is not covered
                     # in accelerate. So, explicitly enable sync gradients to True in that case.
-                    if is_last_step_and_steps_less_than_grad_acc or (
-                        version.parse(accelerate_version) <= version.parse("0.20.3")
-                    ):
+                    if is_last_step_and_steps_less_than_grad_acc:
                         self.accelerator.gradient_state._set_sync_gradients(True)
 
                     # Gradient clipping
@@ -2024,9 +2007,9 @@ def _get_output_dir(self, trial):
             if self.hp_search_backend == HPSearchBackend.OPTUNA:
                 run_id = trial.number
             elif self.hp_search_backend == HPSearchBackend.RAY:
-                from ray import tune
+                import ray.train
 
-                run_id = tune.get_trial_id()
+                run_id = ray.train.get_context().get_trial_id()
             elif self.hp_search_backend == HPSearchBackend.SIGOPT:
                 run_id = trial.id
             elif self.hp_search_backend == HPSearchBackend.WANDB:
@@ -2051,7 +2034,7 @@ def _load_from_checkpoint(self, resume_from_checkpoint, model=None):
         safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME)
         safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME)
         is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and any(
-            WEIGHTS_NAME.split(".")[0] in folder_name
+            FSDP_MODEL_NAME in folder_name
             for folder_name in os.listdir(resume_from_checkpoint)
             if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name))
         )
@@ -2359,57 +2342,21 @@ def _save_checkpoint(self, model, trial, metrics=None):
 
         run_dir = self._get_output_dir(trial=trial)
         output_dir = os.path.join(run_dir, checkpoint_folder)
-        self.save_model(output_dir, _internal_call=True)
-        if self.is_deepspeed_enabled:
-            # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
-            # config `stage3_gather_16bit_weights_on_model_save` is True
-            self.model_wrapped.save_checkpoint(output_dir)
-
-        # Save optimizer and scheduler
-        if self.fsdp or self.is_fsdp_enabled:
-            if self.is_fsdp_enabled:
-                save_fsdp_optimizer(
-                    self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir
-                )
-            else:
-                # FSDP has a different interface for saving optimizer states.
-                # Needs to be called on all ranks to gather all states.
-                # full_optim_state_dict will be deprecated after Pytorch 2.2!
-                full_osd = self.model.__class__.full_optim_state_dict(self.model, self.optimizer)
-                torch.save(full_osd, os.path.join(output_dir, OPTIMIZER_NAME))
-
-        if is_torch_tpu_available():
-            xm.rendezvous("saving_optimizer_states")
-            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
-            with warnings.catch_warnings(record=True) as caught_warnings:
-                xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
-                reissue_pt_warnings(caught_warnings)
-        elif is_sagemaker_mp_enabled():
-            opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
-            smp.barrier()
-            if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state:
-                smp.save(
-                    opt_state_dict,
-                    os.path.join(output_dir, OPTIMIZER_NAME),
-                    partial=True,
-                    v3=smp.state.cfg.shard_optimizer_state,
-                )
-        elif self.args.should_save and not self.is_deepspeed_enabled and not (self.fsdp or self.is_fsdp_enabled):
-            # deepspeed.save_checkpoint above saves model/optim/sched
-            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+        if os.path.exists(output_dir) and len(os.listdir(output_dir)) > 0:
+            logger.warning(
+                f"Checkpoint destination directory {output_dir} already exists and is non-empty."
+                "Saving will proceed but saved results may be invalid."
+            )
+            staging_output_dir = output_dir
+        else:
+            staging_output_dir = os.path.join(run_dir, f"tmp-{checkpoint_folder}")
+        self.save_model(staging_output_dir, _internal_call=True)
 
-        # Save SCHEDULER & SCALER
-        is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance(
-            self.lr_scheduler, DeepSpeedSchedulerWrapper
-        )
-        if (
-            self.args.should_save
-            and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler)
-            and not is_torch_tpu_available()
-        ):
-            with warnings.catch_warnings(record=True) as caught_warnings:
-                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
-            reissue_pt_warnings(caught_warnings)
+        if not self.args.save_only_model:
+            # Save optimizer and scheduler
+            self._save_optimizer_and_scheduler(staging_output_dir)
+            # Save RNG state
+            self._save_rng_state(staging_output_dir)
 
         # Determine the new best metric / best model checkpoint
         if metrics is not None and self.args.metric_for_best_model is not None:
@@ -2429,8 +2376,20 @@ def _save_checkpoint(self, model, trial, metrics=None):
 
         # Save the Trainer state
         if self.args.should_save:
-            self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME))
+            self.state.save_to_json(os.path.join(staging_output_dir, TRAINER_STATE_NAME))
+
+        if self.args.push_to_hub:
+            self._push_from_checkpoint(staging_output_dir)
+
+        # Place checkpoint in final location after all saving is finished.
+        if staging_output_dir != output_dir:
+            os.rename(staging_output_dir, output_dir)
+
+        # Maybe delete some older checkpoints.
+        if self.args.should_save:
+            self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
 
+    def _save_rng_state(self, output_dir):
         # Save RNG state in non-distributed training
         rng_states = {
             "python": random.getstate(),
@@ -2462,12 +2421,49 @@ def _save_checkpoint(self, model, trial, metrics=None):
         else:
             torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.process_index}.pth"))
 
-        if self.args.push_to_hub:
-            self._push_from_checkpoint(output_dir)
+    def _save_optimizer_and_scheduler(self, output_dir):
+        if is_torch_tpu_available():
+            xm.rendezvous("saving_optimizer_states")
+            xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
+            with warnings.catch_warnings(record=True) as caught_warnings:
+                xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+                reissue_pt_warnings(caught_warnings)
+        elif is_sagemaker_mp_enabled():
+            opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False)
+            smp.barrier()
+            if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state:
+                smp.save(
+                    opt_state_dict,
+                    os.path.join(output_dir, OPTIMIZER_NAME),
+                    partial=True,
+                    v3=smp.state.cfg.shard_optimizer_state,
+                )
+        elif self.is_deepspeed_enabled:
+            # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
+            # config `stage3_gather_16bit_weights_on_model_save` is True
+            self.model_wrapped.save_checkpoint(output_dir)
+        elif self.is_fsdp_enabled:
+            # save fsdp specific ckpt for resuming from ckpt
+            save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir)
+            save_fsdp_optimizer(
+                self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir
+            )
+        elif self.args.should_save:
+            # deepspeed.save_checkpoint above saves model/optim/sched
+            torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME))
 
-        # Maybe delete some older checkpoints.
-        if self.args.should_save:
-            self._rotate_checkpoints(use_mtime=True, output_dir=run_dir)
+        # Save SCHEDULER & SCALER
+        is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance(
+            self.lr_scheduler, DeepSpeedSchedulerWrapper
+        )
+        if (
+            self.args.should_save
+            and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler)
+            and not is_torch_tpu_available()
+        ):
+            with warnings.catch_warnings(record=True) as caught_warnings:
+                torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME))
+            reissue_pt_warnings(caught_warnings)
 
     def _load_optimizer_and_scheduler(self, checkpoint):
         """If optimizer and scheduler states exist, load them."""
@@ -2535,23 +2531,14 @@ def opt_load_hook(mod, opt):
                     # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more
                     # likely to get OOM on CPU (since we load num_gpu times the optimizer state
                     map_location = self.args.device if self.args.world_size > 1 else "cpu"
-                    if self.fsdp or self.is_fsdp_enabled:
-                        if self.is_fsdp_enabled:
-                            load_fsdp_optimizer(
-                                self.accelerator.state.fsdp_plugin,
-                                self.accelerator,
-                                self.optimizer,
-                                self.model,
-                                checkpoint,
-                            )
-                        else:
-                            full_osd = None
-                            # In FSDP, we need to load the full optimizer state dict on rank 0 and then shard it
-                            if self.args.process_index == 0:
-                                full_osd = torch.load(os.path.join(checkpoint, OPTIMIZER_NAME))
-                            # call scatter_full_optim_state_dict on all ranks
-                            sharded_osd = self.model.__class__.scatter_full_optim_state_dict(full_osd, self.model)
-                            self.optimizer.load_state_dict(sharded_osd)
+                    if self.is_fsdp_enabled:
+                        load_fsdp_optimizer(
+                            self.accelerator.state.fsdp_plugin,
+                            self.accelerator,
+                            self.optimizer,
+                            self.model,
+                            checkpoint,
+                        )
                     else:
                         self.optimizer.load_state_dict(
                             torch.load(os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location)
@@ -2826,19 +2813,14 @@ def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = Fa
             if IS_SAGEMAKER_MP_POST_1_10:
                 # 'user_content.pt' indicates model state_dict saved with smp >= 1.10
                 Path(os.path.join(output_dir, "user_content.pt")).touch()
-        elif self.fsdp is not None or self.is_fsdp_enabled:
-            state_dict = self.model.state_dict() if not self.is_fsdp_enabled else {}
-            if self.args.should_save:
-                self._save(output_dir, state_dict=state_dict)
-            if self.is_fsdp_enabled:
-                # remove the dummy state_dict
-                remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME])
-                save_fsdp_model(self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir)
-
+        elif self.is_fsdp_enabled:
+            if ("FULL_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)) and (
+                version.parse(accelerate_version) > version.parse("0.24.1")
+            ):
+                state_dict = self.accelerator.get_state_dict(self.model)
+                if self.args.should_save:
+                    self._save(output_dir, state_dict=state_dict)
         elif self.is_deepspeed_enabled:
-            # this takes care of everything as long as we aren't under zero3
-            if version.parse(accelerate_version) <= version.parse("0.20.3"):
-                raise ValueError("Install Accelerate from main branch")
             try:
                 state_dict = self.accelerator.get_state_dict(self.deepspeed)
                 if self.args.should_save:
@@ -2874,7 +2856,7 @@ def _save_tpu(self, output_dir: Optional[str] = None):
         xm.rendezvous("saving_checkpoint")
         if not isinstance(self.model, PreTrainedModel):
             if isinstance(unwrap_model(self.model), PreTrainedModel):
-                unwrap_model(self.model).save_pretrained(
+                unwrap_model(self.model).to("cpu").save_pretrained(
                     output_dir,
                     is_main_process=self.args.should_save,
                     state_dict=self.model.state_dict(),
@@ -2882,10 +2864,12 @@ def _save_tpu(self, output_dir: Optional[str] = None):
                 )
             else:
                 logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
-                state_dict = self.model.state_dict()
+                state_dict = self.model.state_dict().to("cpu")
                 xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
         else:
-            self.model.save_pretrained(output_dir, is_main_process=self.args.should_save, save_function=xm.save)
+            self.model.to("cpu").save_pretrained(
+                output_dir, is_main_process=self.args.should_save, save_function=xm.save
+            )
         if self.tokenizer is not None and self.args.should_save:
             self.tokenizer.save_pretrained(output_dir)
 
@@ -3247,11 +3231,7 @@ def evaluation_loop(
             self.control = self.callback_handler.on_prediction_step(args, self.state, self.control)
 
             # Gather all tensors and put them back on the CPU if we have done enough accumulation steps.
-            if (
-                args.eval_accumulation_steps is not None
-                and (step + 1) % args.eval_accumulation_steps == 0
-                and (self.accelerator.sync_gradients or version.parse(accelerate_version) > version.parse("0.20.3"))
-            ):
+            if args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0:
                 if losses_host is not None:
                     losses = nested_numpify(losses_host)
                     all_losses = losses if all_losses is None else np.concatenate((all_losses, losses), axis=0)
@@ -3496,63 +3476,6 @@ def init_hf_repo(self):
         self.hub_model_id = repo_url.repo_id
         self.push_in_progress = None
 
-    def init_git_repo(self, at_init: bool = False):
-        """
-        Initializes a git repo in `self.args.hub_model_id`.
-
-        <Tip warning={true}>
-
-        This function is deprecated and will be removed in v4.34.0 of Transformers.
-
-        </Tip>
-
-        Args:
-            at_init (`bool`, *optional*, defaults to `False`):
-                Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is
-                `True` and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped
-                out.
-        """
-        warnings.warn(
-            "`Trainer.init_git_repo` is deprecated and will be removed in v4.34.0 of Transformers. Use "
-            "`Trainer.init_hf_repo` instead."
-        )
-        if not self.is_world_process_zero():
-            return
-
-        # Make sure the repo exists + retrieve "real" repo_id
-        repo_name = self.args.hub_model_id
-        if repo_name is None:
-            repo_name = Path(self.args.output_dir).absolute().name
-        repo_id = create_repo(
-            repo_id=repo_name, token=self.args.hub_token, private=self.args.hub_private_repo, exist_ok=True
-        ).repo_id
-
-        try:
-            self.repo = Repository(self.args.output_dir, clone_from=repo_id, token=self.args.hub_token)
-        except EnvironmentError:
-            if self.args.overwrite_output_dir and at_init:
-                # Try again after wiping output_dir
-                shutil.rmtree(self.args.output_dir)
-                self.repo = Repository(self.args.output_dir, clone_from=repo_id, token=self.args.hub_token)
-            else:
-                raise
-
-        self.repo.git_pull()
-
-        # By default, ignore the checkpoint folders
-        if (
-            not os.path.exists(os.path.join(self.args.output_dir, ".gitignore"))
-            and self.args.hub_strategy != HubStrategy.ALL_CHECKPOINTS
-        ):
-            with open(os.path.join(self.args.output_dir, ".gitignore"), "w", encoding="utf-8") as writer:
-                writer.writelines(["checkpoint-*/"])
-
-        # Add "*.sagemaker" to .gitignore if using SageMaker
-        if os.environ.get("SM_TRAINING_ENV"):
-            self._add_sm_patterns_to_gitignore()
-
-        self.push_in_progress = None
-
     def create_model_card(
         self,
         language: Optional[str] = None,
@@ -3593,6 +3516,12 @@ def create_model_card(
         if not self.is_world_process_zero():
             return
 
+        model_card_filepath = os.path.join(self.args.output_dir, "README.md")
+        is_peft_library = False
+        if os.path.exists(model_card_filepath):
+            library_name = ModelCard.load(model_card_filepath).data.get("library_name")
+            is_peft_library = library_name == "peft"
+
         training_summary = TrainingSummary.from_trainer(
             self,
             language=language,
@@ -3606,9 +3535,12 @@ def create_model_card(
             dataset_args=dataset_args,
         )
         model_card = training_summary.to_model_card()
-        with open(os.path.join(self.args.output_dir, "README.md"), "w") as f:
+        with open(model_card_filepath, "w") as f:
             f.write(model_card)
 
+        if is_peft_library:
+            unwrap_model(self.model).create_or_update_model_card(self.args.output_dir)
+
     def _push_from_checkpoint(self, checkpoint_folder):
         # Only push from one node.
         if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END:
@@ -3934,8 +3866,7 @@ def _add_sm_patterns_to_gitignore(self) -> None:
 
     def create_accelerator_and_postprocess(self):
         grad_acc_kwargs = {"num_steps": self.args.gradient_accumulation_steps}
-        if version.parse(accelerate_version) > version.parse("0.20.3"):
-            grad_acc_kwargs["sync_with_dataloader"] = False
+        grad_acc_kwargs["sync_with_dataloader"] = False
         gradient_accumulation_plugin = GradientAccumulationPlugin(**grad_acc_kwargs)
 
         # create accelerator object
diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py
index 5e0c75a1b841f4..7533d7219c19db 100644
--- a/src/transformers/trainer_callback.py
+++ b/src/transformers/trainer_callback.py
@@ -59,6 +59,9 @@ class TrainerState:
             Run an evaluation every X steps.
         save_steps (`int`, *optional*, defaults to 500):
             Save checkpoint every X updates steps.
+        train_batch_size (`int`, *optional*):
+            The batch size for the training dataloader. Only needed when
+            `auto_find_batch_size` has been used.
         num_input_tokens_seen (`int`, *optional*, defaults to 0):
             The number of tokens seen during training (number of input tokens, not the number of prediction tokens).
         total_flos (`float`, *optional*, defaults to 0):
@@ -88,6 +91,7 @@ class TrainerState:
     logging_steps: int = 500
     eval_steps: int = 500
     save_steps: int = 500
+    train_batch_size: int = None
     num_train_epochs: int = 0
     num_input_tokens_seen: int = 0
     total_flos: float = 0
@@ -207,7 +211,7 @@ class TrainerCallback:
 
     The argument `args`, `state` and `control` are positionals for all events, all the others are grouped in `kwargs`.
     You can unpack the ones you need in the signature of the event using them. As an example, see the code of the
-    simple [`~transformer.PrinterCallback`].
+    simple [`~transformers.PrinterCallback`].
 
     Example:
 
diff --git a/src/transformers/trainer_utils.py b/src/transformers/trainer_utils.py
index e6f26d0df51969..dbd868d1120246 100644
--- a/src/transformers/trainer_utils.py
+++ b/src/transformers/trainer_utils.py
@@ -727,6 +727,8 @@ class FSDPOption(ExplicitEnum):
     FULL_SHARD = "full_shard"
     SHARD_GRAD_OP = "shard_grad_op"
     NO_SHARD = "no_shard"
+    HYBRID_SHARD = "hybrid_shard"
+    HYBRID_SHARD_ZERO2 = "hybrid_shard_zero2"
     OFFLOAD = "offload"
     AUTO_WRAP = "auto_wrap"
 
diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py
index b368d86e0ed8e9..bc1f1bfd506dd8 100644
--- a/src/transformers/training_args.py
+++ b/src/transformers/training_args.py
@@ -234,8 +234,8 @@ class TrainingArguments:
             the last epoch before stopping training).
         max_steps (`int`, *optional*, defaults to -1):
             If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
-            In case of using a finite iterable dataset the training may stop before reaching the set number of steps
-            when all data is exhausted
+            For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+            `max_steps` is reached.
         lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`):
             The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values.
         lr_scheduler_kwargs ('dict', *optional*, defaults to {}):
@@ -304,6 +304,11 @@ class TrainingArguments:
 
             This should not be activated when the different nodes use the same storage as the files will be saved with
             the same names for each node.
+        save_only_model (`bool`, *optional*, defaults to `False`):
+            When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state.
+            Note that when this is true, you won't be able to resume training from checkpoint.
+            This enables you to save storage by not storing the optimizer, scheduler & rng state.
+            You can only load the model using `from_pretrained` with this option set to `True`.
         use_cpu (`bool`, *optional*, defaults to `False`):
             Whether or not to use cpu. If set to False, we will use cuda or mps device if available.
         seed (`int`, *optional*, defaults to 42):
@@ -418,12 +423,14 @@ class TrainingArguments:
 
             - `"full_shard"`: Shard parameters, gradients and optimizer states.
             - `"shard_grad_op"`: Shard optimizer states and gradients.
+            - `"hybrid_shard"`: Apply `FULL_SHARD` within a node, and replicate parameters across nodes.
+            - `"hybrid_shard_zero2"`: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes.
             - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and
               `"shard_grad_op"`).
             - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`.
         fsdp_config (`str` or `dict`, *optional*):
             Config to be used with fsdp (Pytorch Distributed Parallel Training). The value is either a location of
-            deepspeed json config file (e.g., `ds_config.json`) or an already loaded json file as `dict`.
+            fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`.
 
             A List of config and its options:
                 - min_num_params (`int`, *optional*, defaults to `0`):
@@ -452,7 +459,7 @@ class TrainingArguments:
                     FSDP's limit_all_gathers (useful only when `fsdp` field is passed).
                      If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight
                      all-gathers.
-                - use_orig_params (`bool`, *optional*, defaults to `False`)
+                - use_orig_params (`bool`, *optional*, defaults to `True`)
                     If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed
                     frozen and trainable paramteres. Useful in cases such as parameter-efficient fine-tuning. Please
                     refer this
@@ -460,6 +467,10 @@ class TrainingArguments:
                 - sync_module_states (`bool`, *optional*, defaults to `True`)
                     If `"True"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to
                     ensure they are the same across all ranks after initialization
+                - activation_checkpointing (`bool`, *optional*, defaults to `False`):
+                    If `"True"`, activation checkpointing is a technique to reduce memory usage by clearing activations of
+                    certain layers and recomputing them during a backward pass. Effectively, this trades extra
+                    computation time for reduced memory usage.
                 - xla (`bool`, *optional*, defaults to `False`):
                     Whether to use PyTorch/XLA Fully Sharded Data Parallel Training. This is an experimental feature
                     and its API may evolve in the future.
@@ -472,10 +483,6 @@ class TrainingArguments:
                     Will use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be
                     used when the xla flag is set to true, and an auto wrapping policy is specified through
                     fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap.
-                - activation_checkpointing (`bool`, *optional*, defaults to `False`):
-                    If True, activation checkpointing is a technique to reduce memory usage by clearing activations of
-                    certain layers and recomputing them during a backward pass. Effectively, this trades extra
-                    computation time for reduced memory usage.
 
         deepspeed (`str` or `dict`, *optional*):
             Use [Deepspeed](https://github.com/microsoft/deepspeed). This is an experimental feature and its API may
@@ -522,6 +529,10 @@ class TrainingArguments:
             `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise.
         dataloader_pin_memory (`bool`, *optional*, defaults to `True`):
             Whether you want to pin memory in data loaders or not. Will default to `True`.
+        dataloader_persistent_workers (`bool`, *optional*, defaults to `False`):
+            If True, the data loader will not shut down the worker processes after a dataset has been consumed once.
+            This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will
+            increase RAM usage. Will default to `False`.
         skip_memory_metrics (`bool`, *optional*, defaults to `True`):
             Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows
             down the training and evaluation speed.
@@ -835,6 +846,17 @@ class TrainingArguments:
             )
         },
     )
+    save_only_model: bool = field(
+        default=False,
+        metadata={
+            "help": (
+                "When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state."
+                "Note that when this is true, you won't be able to resume training from checkpoint."
+                "This enables you to save storage by not storing the optimizer, scheduler & rng state."
+                "You can only load the model using from_pretrained with this option set to True."
+            )
+        },
+    )
     no_cuda: bool = field(
         default=False,
         metadata={"help": "This argument is deprecated. It will be removed in version 5.0 of 🤗 Transformers."},
@@ -1118,6 +1140,12 @@ class TrainingArguments:
     dataloader_pin_memory: bool = field(
         default=True, metadata={"help": "Whether or not to pin memory for DataLoader."}
     )
+    dataloader_persistent_workers: bool = field(
+        default=False,
+        metadata={
+            "help": "If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will increase RAM usage."
+        },
+    )
     skip_memory_metrics: bool = field(
         default=True, metadata={"help": "Whether or not to skip adding of memory profiler reports to metrics."}
     )
@@ -1670,7 +1698,7 @@ def __post_init__(self):
             os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper()
             os.environ[f"{prefix}FORWARD_PREFETCH"] = self.fsdp_config.get("forward_prefect", "false")
             os.environ[f"{prefix}SYNC_MODULE_STATES"] = self.fsdp_config.get("sync_module_states", "true")
-            os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "false")
+            os.environ[f"{prefix}USE_ORIG_PARAMS"] = self.fsdp_config.get("use_orig_params", "true")
 
         if self.tpu_metrics_debug:
             warnings.warn(
@@ -1716,6 +1744,9 @@ def __post_init__(self):
             self.deepspeed_plugin.set_mixed_precision(mixed_precision)
             self.deepspeed_plugin.set_deepspeed_weakref()
 
+        if self.use_cpu:
+            self.dataloader_pin_memory = False
+
         if self.push_to_hub_token is not None:
             warnings.warn(
                 "`--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use "
@@ -1858,7 +1889,7 @@ def _setup_devices(self) -> "torch.device":
         elif self.distributed_state.distributed_type == DistributedType.MULTI_XPU:
             if "ACCELERATE_USE_XPU" not in os.environ:
                 os.environ["ACCELERATE_USE_XPU"] = "true"
-            self._n_gpu = torch.xpu.device_count()
+            self._n_gpu = 1
             device = torch.device("xpu:0")
             torch.xpu.set_device(device)
         elif self.distributed_state.distributed_type == DistributedType.NO:
@@ -2181,9 +2212,9 @@ def set_training(
                 Total number of training epochs to perform (if not an integer, will perform the decimal part percents
                 of the last epoch before stopping training).
             max_steps (`int`, *optional*, defaults to -1):
-                If set to a positive number, the total number of training steps to perform. Overrides
-                `num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching
-                the set number of steps when all data is exhausted.
+                If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
+                For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+                `max_steps` is reached.
             gradient_accumulation_steps (`int`, *optional*, defaults to 1):
                 Number of updates steps to accumulate the gradients for, before performing a backward/update pass.
 
@@ -2588,9 +2619,9 @@ def set_lr_scheduler(
                 Total number of training epochs to perform (if not an integer, will perform the decimal part percents
                 of the last epoch before stopping training).
             max_steps (`int`, *optional*, defaults to -1):
-                If set to a positive number, the total number of training steps to perform. Overrides
-                `num_train_epochs`. In case of using a finite iterable dataset the training may stop before reaching
-                the set number of steps when all data is exhausted.
+                If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
+                For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+                `max_steps` is reached.
             warmup_ratio (`float`, *optional*, defaults to 0.0):
                 Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
             warmup_steps (`int`, *optional*, defaults to 0):
@@ -2622,6 +2653,7 @@ def set_dataloader(
         drop_last: bool = False,
         num_workers: int = 0,
         pin_memory: bool = True,
+        persistent_workers: bool = False,
         auto_find_batch_size: bool = False,
         ignore_data_skip: bool = False,
         sampler_seed: Optional[int] = None,
@@ -2638,6 +2670,10 @@ def set_dataloader(
                 the main process.
             pin_memory (`bool`, *optional*, defaults to `True`):
                 Whether you want to pin memory in data loaders or not. Will default to `True`.
+            persistent_workers (`bool`, *optional*, defaults to `False`):
+                If True, the data loader will not shut down the worker processes after a dataset has been consumed
+                once. This allows to maintain the workers Dataset instances alive. Can potentially speed up training,
+                but will increase RAM usage. Will default to `False`.
             auto_find_batch_size (`bool`, *optional*, defaults to `False`)
                 Whether to find a batch size that will fit into memory automatically through exponential decay,
                 avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`)
@@ -2667,6 +2703,7 @@ def set_dataloader(
         self.dataloader_drop_last = drop_last
         self.dataloader_num_workers = num_workers
         self.dataloader_pin_memory = pin_memory
+        self.dataloader_persistent_workers = persistent_workers
         self.auto_find_batch_size = auto_find_batch_size
         self.ignore_data_skip = ignore_data_skip
         self.data_seed = sampler_seed
diff --git a/src/transformers/training_args_tf.py b/src/transformers/training_args_tf.py
index 461c4086acc341..5a13cc551b696a 100644
--- a/src/transformers/training_args_tf.py
+++ b/src/transformers/training_args_tf.py
@@ -92,6 +92,8 @@ class TFTrainingArguments(TrainingArguments):
             Total number of training epochs to perform.
         max_steps (`int`, *optional*, defaults to -1):
             If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`.
+            For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until
+            `max_steps` is reached.
         warmup_ratio (`float`, *optional*, defaults to 0.0):
             Ratio of total training steps used for a linear warmup from 0 to `learning_rate`.
         warmup_steps (`int`, *optional*, defaults to 0):
diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py
index e7911e5d552a1e..e9e2f9e0403987 100644
--- a/src/transformers/utils/__init__.py
+++ b/src/transformers/utils/__init__.py
@@ -118,6 +118,7 @@
     is_faiss_available,
     is_flash_attn_2_available,
     is_flash_attn_available,
+    is_flash_attn_greater_or_equal_2_10,
     is_flax_available,
     is_fsdp_available,
     is_ftfy_available,
@@ -179,6 +180,7 @@
     is_torch_mps_available,
     is_torch_neuroncore_available,
     is_torch_npu_available,
+    is_torch_sdpa_available,
     is_torch_tensorrt_fx_available,
     is_torch_tf32_available,
     is_torch_tpu_available,
diff --git a/src/transformers/utils/backbone_utils.py b/src/transformers/utils/backbone_utils.py
index 595aae18832c14..2dc1d82470df26 100644
--- a/src/transformers/utils/backbone_utils.py
+++ b/src/transformers/utils/backbone_utils.py
@@ -44,7 +44,7 @@ def verify_out_features_out_indices(
         if not isinstance(out_indices, (list, tuple)):
             raise ValueError(f"out_indices must be a list or tuple, got {type(out_indices)}")
         if any(idx >= len(stage_names) for idx in out_indices):
-            raise ValueError("out_indices must be valid indices for stage_names {stage_names}, got {out_indices}")
+            raise ValueError(f"out_indices must be valid indices for stage_names {stage_names}, got {out_indices}")
 
     if out_features is not None and out_indices is not None:
         if len(out_features) != len(out_indices):
diff --git a/src/transformers/utils/doc.py b/src/transformers/utils/doc.py
index ac35542f7c3064..23679f31a3e2ec 100644
--- a/src/transformers/utils/doc.py
+++ b/src/transformers/utils/doc.py
@@ -1075,6 +1075,7 @@ def add_code_sample_docstrings(
     expected_output=None,
     expected_loss=None,
     real_checkpoint=None,
+    revision=None,
 ):
     def docstring_decorator(fn):
         # model_class defaults to function's class if not specified otherwise
@@ -1143,6 +1144,15 @@ def docstring_decorator(fn):
         func_doc = (fn.__doc__ or "") + "".join(docstr)
         output_doc = "" if output_type is None else _prepare_output_docstrings(output_type, config_class)
         built_doc = code_sample.format(**doc_kwargs)
+        if revision is not None:
+            if re.match(r"^refs/pr/\\d+", revision):
+                raise ValueError(
+                    f"The provided revision '{revision}' is incorrect. It should point to"
+                    " a pull request reference on the hub like 'refs/pr/6'"
+                )
+            built_doc = built_doc.replace(
+                f'from_pretrained("{checkpoint}")', f'from_pretrained("{checkpoint}", revision="{revision}")'
+            )
         fn.__doc__ = func_doc + output_doc + built_doc
         return fn
 
diff --git a/src/transformers/utils/dummy_flax_objects.py b/src/transformers/utils/dummy_flax_objects.py
index 4090e4ff5134e1..ecf17e711556cb 100644
--- a/src/transformers/utils/dummy_flax_objects.py
+++ b/src/transformers/utils/dummy_flax_objects.py
@@ -800,6 +800,27 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["flax"])
 
 
+class FlaxLlamaForCausalLM(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+
+class FlaxLlamaModel(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+
+class FlaxLlamaPreTrainedModel(metaclass=DummyObject):
+    _backends = ["flax"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["flax"])
+
+
 class FlaxLongT5ForConditionalGeneration(metaclass=DummyObject):
     _backends = ["flax"]
 
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index c6b20c7e36746a..b9b3e9b5807a64 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -16,6 +16,27 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class Cache(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class DynamicCache(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SinkCache(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class GlueDataset(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -627,6 +648,12 @@ def __init__(self, *args, **kwargs):
 MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING = None
 
 
+MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = None
+
+
+MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = None
+
+
 MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
 
 
@@ -1044,6 +1071,13 @@ def __init__(self, *args, **kwargs):
 BEIT_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
+class BeitBackbone(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class BeitForImageClassification(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -2676,6 +2710,48 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class AdaptiveEmbedding(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class TransfoXLForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class TransfoXLLMHeadModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class TransfoXLModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class TransfoXLPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+def load_tf_weights_in_transfo_xl(*args, **kwargs):
+    requires_backends(load_tf_weights_in_transfo_xl, ["torch"])
+
+
 VAN_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -4586,6 +4662,30 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+LLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class LlavaForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class LlavaPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class LlavaProcessor(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 LONGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -5162,6 +5262,34 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+class MixtralForCausalLM(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MixtralForSequenceClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MixtralModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class MixtralPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 MOBILEBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -6019,6 +6147,96 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+PATCHTSMIXER_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class PatchTSMixerForPrediction(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSMixerForPretraining(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSMixerForRegression(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSMixerForTimeSeriesClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSMixerModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSMixerPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+PATCHTST_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class PatchTSTForClassification(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSTForPrediction(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSTForPretraining(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSTForRegression(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSTModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class PatchTSTPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class PegasusForCausalLM(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -7113,6 +7331,51 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class SeamlessM4Tv2ForSpeechToSpeech(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForSpeechToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForTextToSpeech(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2ForTextToText(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class SeamlessM4Tv2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 SEGFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -7739,90 +8002,72 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
+TROCR_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
-class AdaptiveEmbedding(metaclass=DummyObject):
+class TrOCRForCausalLM(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class TransfoXLForSequenceClassification(metaclass=DummyObject):
+class TrOCRPreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class TransfoXLLMHeadModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
+TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
-class TransfoXLModel(metaclass=DummyObject):
+class TvltForAudioVisualClassification(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class TransfoXLPreTrainedModel(metaclass=DummyObject):
+class TvltForPreTraining(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-def load_tf_weights_in_transfo_xl(*args, **kwargs):
-    requires_backends(load_tf_weights_in_transfo_xl, ["torch"])
-
-
-TROCR_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class TrOCRForCausalLM(metaclass=DummyObject):
+class TvltModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class TrOCRPreTrainedModel(metaclass=DummyObject):
+class TvltPreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-TVLT_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class TvltForAudioVisualClassification(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
+TVP_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
-class TvltForPreTraining(metaclass=DummyObject):
+class TvpForVideoGrounding(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class TvltModel(metaclass=DummyObject):
+class TvpModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
-class TvltPreTrainedModel(metaclass=DummyObject):
+class TvpPreTrainedModel(metaclass=DummyObject):
     _backends = ["torch"]
 
     def __init__(self, *args, **kwargs):
@@ -7961,6 +8206,16 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+UNIVNET_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class UnivNetModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class UperNetForSemanticSegmentation(metaclass=DummyObject):
     _backends = ["torch"]
 
@@ -8065,6 +8320,23 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["torch"])
 
 
+VIPLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class VipLlavaForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class VipLlavaPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 class VisionEncoderDecoderModel(metaclass=DummyObject):
     _backends = ["torch"]
 
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 5bc238f5427850..2099c18bcd71c5 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -1075,6 +1075,51 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["tf"])
 
 
+TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TFAdaptiveEmbedding(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFTransfoXLForSequenceClassification(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFTransfoXLLMHeadModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFTransfoXLMainLayer(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFTransfoXLModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
+class TFTransfoXLPreTrainedModel(metaclass=DummyObject):
+    _backends = ["tf"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["tf"])
+
+
 TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_LIST = None
 
 
@@ -2613,51 +2658,6 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["tf"])
 
 
-TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
-
-
-class TFAdaptiveEmbedding(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
-
-
-class TFTransfoXLForSequenceClassification(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
-
-
-class TFTransfoXLLMHeadModel(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
-
-
-class TFTransfoXLMainLayer(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
-
-
-class TFTransfoXLModel(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
-
-
-class TFTransfoXLPreTrainedModel(metaclass=DummyObject):
-    _backends = ["tf"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["tf"])
-
-
 class TFVisionEncoderDecoderModel(metaclass=DummyObject):
     _backends = ["tf"]
 
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index c4a3c9312ac8a7..f1a10ff5710ad0 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -485,6 +485,13 @@ def __init__(self, *args, **kwargs):
         requires_backends(self, ["vision"])
 
 
+class TvpImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
 class VideoMAEFeatureExtractor(metaclass=DummyObject):
     _backends = ["vision"]
 
diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py
index 50320dabb704c6..1559da0e53c68a 100755
--- a/src/transformers/utils/fx.py
+++ b/src/transformers/utils/fx.py
@@ -122,6 +122,7 @@ def _generate_supported_model_class_names(
     "convnext",
     "deberta",
     "deberta-v2",
+    "dinov2",
     "distilbert",
     "donut-swin",
     "electra",
diff --git a/src/transformers/utils/hub.py b/src/transformers/utils/hub.py
index f6cf0a852ed752..6ab1670ea37cf3 100644
--- a/src/transformers/utils/hub.py
+++ b/src/transformers/utils/hub.py
@@ -31,13 +31,16 @@
 import huggingface_hub
 import requests
 from huggingface_hub import (
+    _CACHED_NO_EXIST,
     CommitOperationAdd,
+    constants,
     create_branch,
     create_commit,
     create_repo,
     get_hf_file_metadata,
     hf_hub_download,
     hf_hub_url,
+    try_to_load_from_cache,
 )
 from huggingface_hub.file_download import REGEX_COMMIT_HASH, http_get
 from huggingface_hub.utils import (
@@ -49,7 +52,9 @@
     RevisionNotFoundError,
     build_hf_headers,
     hf_raise_for_status,
+    send_telemetry,
 )
+from huggingface_hub.utils._deprecation import _deprecate_method
 from requests.exceptions import HTTPError
 
 from . import __version__, logging
@@ -75,38 +80,56 @@ def is_offline_mode():
 
 
 torch_cache_home = os.getenv("TORCH_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "torch"))
+default_cache_path = constants.default_cache_path
 old_default_cache_path = os.path.join(torch_cache_home, "transformers")
-# New default cache, shared with the Datasets library
-hf_cache_home = os.path.expanduser(
-    os.getenv("HF_HOME", os.path.join(os.getenv("XDG_CACHE_HOME", "~/.cache"), "huggingface"))
-)
-default_cache_path = os.path.join(hf_cache_home, "hub")
+
+# Determine default cache directory. Lots of legacy environment variables to ensure backward compatibility.
+# The best way to set the cache path is with the environment variable HF_HOME. For more details, checkout this
+# documentation page: https://huggingface.co/docs/huggingface_hub/package_reference/environment_variables.
+#
+# In code, use `HF_HUB_CACHE` as the default cache path. This variable is set by the library and is guaranteed
+# to be set to the right value.
+#
+# TODO: clean this for v5?
+PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", constants.HF_HUB_CACHE)
+PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
+TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", PYTORCH_TRANSFORMERS_CACHE)
 
 # Onetime move from the old location to the new one if no ENV variable has been set.
 if (
     os.path.isdir(old_default_cache_path)
-    and not os.path.isdir(default_cache_path)
+    and not os.path.isdir(constants.HF_HUB_CACHE)
     and "PYTORCH_PRETRAINED_BERT_CACHE" not in os.environ
     and "PYTORCH_TRANSFORMERS_CACHE" not in os.environ
     and "TRANSFORMERS_CACHE" not in os.environ
 ):
     logger.warning(
-        "In Transformers v4.0.0, the default path to cache downloaded models changed from"
-        " '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have"
+        "In Transformers v4.22.0, the default path to cache downloaded models changed from"
+        " '~/.cache/torch/transformers' to '~/.cache/huggingface/hub'. Since you don't seem to have"
         " overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to"
-        " '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should"
+        " '~/.cache/huggingface/hub' to avoid redownloading models you have already in the cache. You should"
         " only see this message once."
     )
-    shutil.move(old_default_cache_path, default_cache_path)
+    shutil.move(old_default_cache_path, constants.HF_HUB_CACHE)
 
-PYTORCH_PRETRAINED_BERT_CACHE = os.getenv("PYTORCH_PRETRAINED_BERT_CACHE", default_cache_path)
-PYTORCH_TRANSFORMERS_CACHE = os.getenv("PYTORCH_TRANSFORMERS_CACHE", PYTORCH_PRETRAINED_BERT_CACHE)
-HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", PYTORCH_TRANSFORMERS_CACHE)
-TRANSFORMERS_CACHE = os.getenv("TRANSFORMERS_CACHE", HUGGINGFACE_HUB_CACHE)
-HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(hf_cache_home, "modules"))
+HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(constants.HF_HOME, "modules"))
 TRANSFORMERS_DYNAMIC_MODULE_NAME = "transformers_modules"
 SESSION_ID = uuid4().hex
-DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", False) in ENV_VARS_TRUE_VALUES
+DISABLE_TELEMETRY = os.getenv("DISABLE_TELEMETRY", constants.HF_HUB_DISABLE_TELEMETRY) in ENV_VARS_TRUE_VALUES
+
+# Add deprecation warning for old environment variables.
+for key in ("PYTORCH_PRETRAINED_BERT_CACHE", "PYTORCH_TRANSFORMERS_CACHE", "TRANSFORMERS_CACHE"):
+    if os.getenv(key) is not None:
+        warnings.warn(
+            f"Using `{key}` is deprecated and will be removed in v5 of Transformers. Use `HF_HOME` instead.",
+            FutureWarning,
+        )
+if os.getenv("DISABLE_TELEMETRY") is not None:
+    warnings.warn(
+        "Using `DISABLE_TELEMETRY` is deprecated and will be removed in v5 of Transformers. Use `HF_HUB_DISABLE_TELEMETRY` instead.",
+        FutureWarning,
+    )
+
 
 S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
 CLOUDFRONT_DISTRIB_PREFIX = "https://cdn.huggingface.co"
@@ -126,15 +149,16 @@ def is_offline_mode():
 HUGGINGFACE_CO_PREFIX = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/{model_id}/resolve/{revision}/{filename}"
 HUGGINGFACE_CO_EXAMPLES_TELEMETRY = HUGGINGFACE_CO_RESOLVE_ENDPOINT + "/api/telemetry/examples"
 
-# Return value when trying to load a file from cache but the file does not exist in the distant repo.
-_CACHED_NO_EXIST = object()
-
 
 def is_remote_url(url_or_filename):
     parsed = urlparse(url_or_filename)
     return parsed.scheme in ("http", "https")
 
 
+# TODO: remove this once fully deprecated
+# TODO? remove from './examples/research_projects/lxmert/utils.py' as well
+# TODO? remove from './examples/research_projects/visual_bert/utils.py' as well
+@_deprecate_method(version="4.39.0", message="This method is outdated and does not support the new cache system.")
 def get_cached_models(cache_dir: Union[str, Path] = None) -> List[Tuple]:
     """
     Returns a list of tuples representing model binaries that are cached locally. Each tuple has shape `(model_url,
@@ -219,7 +243,7 @@ def http_user_agent(user_agent: Union[Dict, str, None] = None) -> str:
     return ua
 
 
-def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str]):
+def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str]) -> Optional[str]:
     """
     Extracts the commit hash from a resolved filename toward a cache file.
     """
@@ -233,73 +257,6 @@ def extract_commit_hash(resolved_file: Optional[str], commit_hash: Optional[str]
     return commit_hash if REGEX_COMMIT_HASH.match(commit_hash) else None
 
 
-def try_to_load_from_cache(
-    repo_id: str,
-    filename: str,
-    cache_dir: Union[str, Path, None] = None,
-    revision: Optional[str] = None,
-    repo_type: Optional[str] = None,
-) -> Optional[str]:
-    """
-    Explores the cache to return the latest cached file for a given revision if found.
-
-    This function will not raise any exception if the file in not cached.
-
-    Args:
-        cache_dir (`str` or `os.PathLike`):
-            The folder where the cached files lie.
-        repo_id (`str`):
-            The ID of the repo on huggingface.co.
-        filename (`str`):
-            The filename to look for inside `repo_id`.
-        revision (`str`, *optional*):
-            The specific model version to use. Will default to `"main"` if it's not provided and no `commit_hash` is
-            provided either.
-        repo_type (`str`, *optional*):
-            The type of the repo.
-
-    Returns:
-        `Optional[str]` or `_CACHED_NO_EXIST`:
-            Will return `None` if the file was not cached. Otherwise:
-            - The exact path to the cached file if it's found in the cache
-            - A special value `_CACHED_NO_EXIST` if the file does not exist at the given commit hash and this fact was
-              cached.
-    """
-    if revision is None:
-        revision = "main"
-
-    if cache_dir is None:
-        cache_dir = TRANSFORMERS_CACHE
-
-    object_id = repo_id.replace("/", "--")
-    if repo_type is None:
-        repo_type = "model"
-    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
-    if not os.path.isdir(repo_cache):
-        # No cache for this model
-        return None
-    for subfolder in ["refs", "snapshots"]:
-        if not os.path.isdir(os.path.join(repo_cache, subfolder)):
-            return None
-
-    # Resolve refs (for instance to convert main to the associated commit sha)
-    cached_refs = os.listdir(os.path.join(repo_cache, "refs"))
-    if revision in cached_refs:
-        with open(os.path.join(repo_cache, "refs", revision)) as f:
-            revision = f.read()
-
-    if os.path.isfile(os.path.join(repo_cache, ".no_exist", revision, filename)):
-        return _CACHED_NO_EXIST
-
-    cached_shas = os.listdir(os.path.join(repo_cache, "snapshots"))
-    if revision not in cached_shas:
-        # No cache for this revision and we won't try to return a random revision
-        return None
-
-    cached_file = os.path.join(repo_cache, "snapshots", revision, filename)
-    return cached_file if os.path.isfile(cached_file) else None
-
-
 def cached_file(
     path_or_repo_id: Union[str, os.PathLike],
     filename: str,
@@ -317,7 +274,7 @@ def cached_file(
     _raise_exceptions_for_connection_errors: bool = True,
     _commit_hash: Optional[str] = None,
     **deprecated_kwargs,
-):
+) -> Optional[str]:
     """
     Tries to locate a file in a local folder and repo, downloads and cache it if necessary.
 
@@ -369,7 +326,8 @@ def cached_file(
     ```python
     # Download a model weight from the Hub and cache it.
     model_weights_file = cached_file("bert-base-uncased", "pytorch_model.bin")
-    ```"""
+    ```
+    """
     use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
     if use_auth_token is not None:
         warnings.warn(
@@ -499,6 +457,10 @@ def cached_file(
     return resolved_file
 
 
+# TODO: deprecate `get_file_from_repo` or document it differently?
+#       Docstring is exactly the same as `cached_repo` but behavior is slightly different. If file is missing or if
+#       there is a connection error, `cached_repo` will return None while `get_file_from_repo` will raise an error.
+#       IMO we should keep only 1 method and have a single `raise_error` argument (to be discussed).
 def get_file_from_repo(
     path_or_repo: Union[str, os.PathLike],
     filename: str,
@@ -564,7 +526,8 @@ def get_file_from_repo(
     tokenizer_config = get_file_from_repo("bert-base-uncased", "tokenizer_config.json")
     # This model does not have a tokenizer config so the result will be None.
     tokenizer_config = get_file_from_repo("xlm-roberta-base", "tokenizer_config.json")
-    ```"""
+    ```
+    """
     use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
     if use_auth_token is not None:
         warnings.warn(
@@ -609,10 +572,11 @@ def download_url(url, proxies=None):
         f"Using `from_pretrained` with the url of a file (here {url}) is deprecated and won't be possible anymore in"
         " v5 of Transformers. You should host your file on the Hub (hf.co) instead and use the repository ID. Note"
         " that this is not compatible with the caching system (your file will be downloaded at each execution) or"
-        " multiple processes (each process will download the file in a different temporary file)."
+        " multiple processes (each process will download the file in a different temporary file).",
+        FutureWarning,
     )
-    tmp_file = tempfile.mkstemp()[1]
-    with open(tmp_file, "wb") as f:
+    tmp_fd, tmp_file = tempfile.mkstemp()
+    with os.fdopen(tmp_fd, "wb") as f:
         http_get(url, f, proxies=proxies)
     return tmp_file
 
@@ -947,13 +911,10 @@ def send_example_telemetry(example_name, *example_args, framework="pytorch"):
             script_name = script_name.replace("_no_trainer", "")
             data["dataset_name"] = f"{script_name}-{args_as_dict['task_name']}"
 
-    headers = {"user-agent": http_user_agent(data)}
-    try:
-        r = requests.head(HUGGINGFACE_CO_EXAMPLES_TELEMETRY, headers=headers)
-        r.raise_for_status()
-    except Exception:
-        # We don't want to error in case of connection errors of any kind.
-        pass
+    # Send telemetry in the background
+    send_telemetry(
+        topic="examples", library_name="transformers", library_version=__version__, user_agent=http_user_agent(data)
+    )
 
 
 def convert_file_size_to_int(size: Union[int, str]):
@@ -1165,7 +1126,7 @@ def move_cache(cache_dir=None, new_cache_dir=None, token=None):
     if new_cache_dir is None:
         new_cache_dir = TRANSFORMERS_CACHE
     if cache_dir is None:
-        # Migrate from old cache in .cache/huggingface/hub
+        # Migrate from old cache in .cache/huggingface/transformers
         old_cache = Path(TRANSFORMERS_CACHE).parent / "transformers"
         if os.path.isdir(str(old_cache)):
             cache_dir = str(old_cache)
@@ -1258,7 +1219,7 @@ def cancel(self) -> None:
             "`transformers.utils.move_cache()`."
         )
     try:
-        if TRANSFORMERS_CACHE != default_cache_path:
+        if TRANSFORMERS_CACHE != constants.HF_HUB_CACHE:
             # Users set some env variable to customize cache storage
             move_cache(TRANSFORMERS_CACHE, TRANSFORMERS_CACHE)
         else:
diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py
index c4862b197c97e7..9f3b6865a9b540 100644
--- a/src/transformers/utils/import_utils.py
+++ b/src/transformers/utils/import_utils.py
@@ -67,13 +67,13 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[
 # This is the version of torch required to run torch.fx features and torch.onnx with dictionary inputs.
 TORCH_FX_REQUIRED_VERSION = version.parse("1.10")
 
+ACCELERATE_MIN_VERSION = "0.21.0"
+FSDP_MIN_VERSION = "1.12.0"
+
 
 _accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True)
 _apex_available = _is_package_available("apex")
 _bitsandbytes_available = _is_package_available("bitsandbytes")
-_flash_attn_2_available = _is_package_available("flash_attn") and version.parse(
-    importlib.metadata.version("flash_attn")
-) >= version.parse("2.1.0")
 # `importlib.metadata.version` doesn't work with `bs4` but `beautifulsoup4`. For `importlib.util.find_spec`, reversed.
 _bs4_available = importlib.util.find_spec("bs4") is not None
 _coloredlogs_available = _is_package_available("coloredlogs")
@@ -261,6 +261,19 @@ def get_torch_version():
     return _torch_version
 
 
+def is_torch_sdpa_available():
+    if not is_torch_available():
+        return False
+    elif _torch_version == "N/A":
+        return False
+
+    # NOTE: We require torch>=2.1 (and not torch>=2.0) to use SDPA in Transformers for two reasons:
+    # - Allow the global use of the `scale` argument introduced in https://github.com/pytorch/pytorch/pull/95259
+    # - Memory-efficient attention supports arbitrary attention_mask: https://github.com/pytorch/pytorch/pull/104310
+    # NOTE: We require torch>=2.1.1 to avoid a numerical issue in SDPA with non-contiguous inputs: https://github.com/pytorch/pytorch/issues/112577
+    return version.parse(_torch_version) >= version.parse("2.1.1")
+
+
 def is_torchvision_available():
     return _torchvision_available
 
@@ -608,10 +621,29 @@ def is_flash_attn_2_available():
     if not is_torch_available():
         return False
 
+    if not _is_package_available("flash_attn"):
+        return False
+
     # Let's add an extra check to see if cuda is available
     import torch
 
-    return _flash_attn_2_available and torch.cuda.is_available()
+    if not torch.cuda.is_available():
+        return False
+
+    if torch.version.cuda:
+        return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0")
+    elif torch.version.hip:
+        # TODO: Bump the requirement to 2.1.0 once released in https://github.com/ROCmSoftwarePlatform/flash-attention
+        return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.0.4")
+    else:
+        return False
+
+
+def is_flash_attn_greater_or_equal_2_10():
+    if not _is_package_available("flash_attn"):
+        return False
+
+    return version.parse(importlib.metadata.version("flash_attn")) >= version.parse("2.1.0")
 
 
 def is_flash_attn_available():
@@ -652,13 +684,13 @@ def is_protobuf_available():
     return importlib.util.find_spec("google.protobuf") is not None
 
 
-def is_accelerate_available(min_version: str = None):
+def is_accelerate_available(min_version: str = ACCELERATE_MIN_VERSION):
     if min_version is not None:
         return _accelerate_available and version.parse(_accelerate_version) >= version.parse(min_version)
     return _accelerate_available
 
 
-def is_fsdp_available(min_version: str = "1.12.0"):
+def is_fsdp_available(min_version: str = FSDP_MIN_VERSION):
     return is_torch_available() and version.parse(_torch_version) >= version.parse(min_version)
 
 
@@ -1125,8 +1157,9 @@ def is_jinja_available():
 
 # docstyle-ignore
 ACCELERATE_IMPORT_ERROR = """
-{0} requires the accelerate library but it was not found in your environment. You can install it with pip:
-`pip install accelerate`. Please note that you may need to restart your runtime after installation.
+{0} requires the accelerate library >= {ACCELERATE_MIN_VERSION} it was not found in your environment.
+You can install or update it with pip: `pip install --upgrade accelerate`. Please note that you may need to restart your
+runtime after installation.
 """
 
 # docstyle-ignore
diff --git a/src/transformers/utils/notebook.py b/src/transformers/utils/notebook.py
index c97849d02dbe89..f7396642732e58 100644
--- a/src/transformers/utils/notebook.py
+++ b/src/transformers/utils/notebook.py
@@ -161,7 +161,7 @@ def update(self, value: int, force_update: bool = False, comment: str = None):
             self.update_bar(value)
             self.last_value = value
             self.last_time = current_time
-            if self.average_time_per_item is None:
+            if (self.average_time_per_item is None) or (self.average_time_per_item == 0):
                 self.wait_for = 1
             else:
                 self.wait_for = max(int(self.update_every / self.average_time_per_item), 1)
@@ -177,7 +177,11 @@ def update_bar(self, value, comment=None):
                 f"[{spaced_value}/{self.total} {format_time(self.elapsed_time)} <"
                 f" {format_time(self.predicted_remaining)}"
             )
-            self.label += f", {1/self.average_time_per_item:.2f} it/s"
+            if self.average_time_per_item == 0:
+                self.label += ", +inf it/s"
+            else:
+                self.label += f", {1/self.average_time_per_item:.2f} it/s"
+
         self.label += "]" if self.comment is None or len(self.comment) == 0 else f", {self.comment}]"
         self.display()
 
@@ -367,6 +371,8 @@ def on_evaluate(self, args, state, control, metrics=None, **kwargs):
 
     def on_train_end(self, args, state, control, **kwargs):
         self.training_tracker.update(
-            state.global_step, comment=f"Epoch {int(state.epoch)}/{state.num_train_epochs}", force_update=True
+            state.global_step,
+            comment=f"Epoch {int(state.epoch)}/{state.num_train_epochs}",
+            force_update=True,
         )
         self.training_tracker = None
diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py
index 222ba68a6dc1e4..4f268ab6bc7102 100644
--- a/src/transformers/utils/quantization_config.py
+++ b/src/transformers/utils/quantization_config.py
@@ -24,7 +24,7 @@
 
 from packaging import version
 
-from ..utils import is_torch_available, logging
+from ..utils import is_auto_awq_available, is_torch_available, logging
 
 
 if is_torch_available():
@@ -543,6 +543,12 @@ class AwqConfig(QuantizationConfigMixin):
         backend (`AwqBackendPackingMethod`, *optional*, defaults to `AwqBackendPackingMethod.AUTOAWQ`):
             The quantization backend. Some models might be quantized using `llm-awq` backend. This is useful for users
             that quantize their own models using `llm-awq` library.
+        do_fuse (`bool`, *optional*, defaults to `False`):
+            Whether to fuse attention and mlp layers together for faster inference
+        fuse_max_seq_len (`int`, *optional*):
+            The Maximum sequence length to generate when using fusing.
+        modules_to_fuse (`dict`, *optional*, default to `None`):
+            Overwrite the natively supported fusing scheme with the one specified by the users.
     """
 
     def __init__(
@@ -552,6 +558,9 @@ def __init__(
         zero_point: bool = True,
         version: AWQLinearVersion = AWQLinearVersion.GEMM,
         backend: AwqBackendPackingMethod = AwqBackendPackingMethod.AUTOAWQ,
+        do_fuse: Optional[bool] = None,
+        fuse_max_seq_len: Optional[int] = None,
+        modules_to_fuse: Optional[dict] = None,
         **kwargs,
     ):
         self.quant_method = QuantizationMethod.AWQ
@@ -561,6 +570,14 @@ def __init__(
         self.zero_point = zero_point
         self.version = version
         self.backend = backend
+        self.fuse_max_seq_len = fuse_max_seq_len
+
+        self.modules_to_fuse = modules_to_fuse
+        if do_fuse is None:
+            self.do_fuse = modules_to_fuse is not None and len(modules_to_fuse) > 0
+        else:
+            self.do_fuse = do_fuse
+        self.fuse_max_seq_len = fuse_max_seq_len
 
         self.post_init()
 
@@ -587,3 +604,42 @@ def post_init(self):
             major, minor = compute_capability
             if major < 8:
                 raise ValueError("LLM-AWQ backend is only supported on GPUs with compute capability >= 8.0")
+
+        if self.do_fuse and self.fuse_max_seq_len is None:
+            raise ValueError(
+                "You cannot enable fused modules without specifying a `fuse_max_seq_len`, make sure to pass a valid `fuse_max_seq_len` for your usecase"
+            )
+
+        if self.do_fuse:
+            awq_version_supports_fusing = False
+            MIN_AWQ_VERSION = "0.1.7"
+            if is_auto_awq_available():
+                awq_version_supports_fusing = version.parse(importlib.metadata.version("autoawq")) >= version.parse(
+                    MIN_AWQ_VERSION
+                )
+
+            if not awq_version_supports_fusing:
+                raise ValueError(
+                    f"You current version of `autoawq` does not support module fusing, please upgrade `autoawq` package to at least {MIN_AWQ_VERSION}."
+                )
+
+        if self.do_fuse and self.modules_to_fuse is not None:
+            required_keys = [
+                "hidden_size",
+                "num_attention_heads",
+                "num_key_value_heads",
+                "mlp",
+                "attention",
+                "layernorm",
+                "use_alibi",
+            ]
+            if not all(key in self.modules_to_fuse for key in required_keys):
+                raise ValueError(
+                    f"Required fields are missing in the fusing mapping, required fields are {required_keys}"
+                )
+
+    def get_loading_attributes(self):
+        attibutes_dict = copy.deepcopy(self.__dict__)
+        loading_attibutes = ["do_fuse", "modules_to_fuse", "fuse_max_seq_len"]
+        loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes}
+        return loading_attibutes_dict
diff --git a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
index acdfe49090e8bd..f01283ae08fc7c 100755
--- a/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
+++ b/templates/adding_a_new_example_script/{{cookiecutter.directory_name}}/run_{{cookiecutter.example_shortcut}}.py
@@ -290,7 +290,7 @@ def main():
             extension = "text"
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
@@ -735,7 +735,7 @@ def main():
         extension = args.train_file.split(".")[-1]
         raw_datasets = load_dataset(extension, data_files=data_files)
     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Load pretrained model and tokenizer
     #
diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py
index 2352cf522f29a7..14c8f6703166c9 100644
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -561,8 +561,8 @@ def test_gradient_accumulation(self, stage, dtype):
         self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
         self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)
 
-        # see the note above how to get identical loss on a small bs
-        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
+        # Relative difference. See the note above how to get identical loss on a small bs
+        self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)
 
     def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
         # adapted from TrainerIntegrationCommon.check_saved_checkpoints
diff --git a/tests/generation/test_configuration_utils.py b/tests/generation/test_configuration_utils.py
index a181b00ee08d2c..e5eb1bb34cc0dc 100644
--- a/tests/generation/test_configuration_utils.py
+++ b/tests/generation/test_configuration_utils.py
@@ -120,6 +120,34 @@ def test_kwarg_init(self):
         self.assertEqual(loaded_config.do_sample, True)
         self.assertEqual(loaded_config.num_beams, 1)  # default value
 
+    def test_validate(self):
+        """
+        Tests that the `validate` method is working as expected. Note that `validate` is called at initialization time
+        """
+        # Case 1: A correct configuration will not throw any warning
+        with warnings.catch_warnings(record=True) as captured_warnings:
+            GenerationConfig()
+        self.assertEqual(len(captured_warnings), 0)
+
+        # Case 2: Inconsequent but technically wrong configuration will throw a warning (e.g. setting sampling
+        # parameters with `do_sample=False`). May be escalated to an error in the future.
+        with warnings.catch_warnings(record=True) as captured_warnings:
+            GenerationConfig(temperature=0.5)
+        self.assertEqual(len(captured_warnings), 1)
+
+        # Case 3: Impossible sets of contraints/parameters will raise an exception
+        with self.assertRaises(ValueError):
+            GenerationConfig(num_return_sequences=2)
+
+        # Case 4: Passing `generate()`-only flags to `validate` will raise an exception
+        with self.assertRaises(ValueError):
+            GenerationConfig(logits_processor="foo")
+
+        # Case 5: Model-specific parameters will NOT raise an exception or a warning
+        with warnings.catch_warnings(record=True) as captured_warnings:
+            GenerationConfig(foo="bar")
+        self.assertEqual(len(captured_warnings), 0)
+
     def test_refuse_to_save(self):
         """Tests that we refuse to save a generation config that fails validation."""
 
diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py
index 8a269801640ebf..306cb15168e5be 100644
--- a/tests/generation/test_framework_agnostic.py
+++ b/tests/generation/test_framework_agnostic.py
@@ -633,11 +633,7 @@ def test_eos_token_id_int_and_list_beam_search(self):
             "do_sample": False,
             "num_beams": 3,
         }
-        if is_pt:
-            expectation = 20
-        else:
-            # TODO (joao): fix me
-            expectation = 13
+        expectation = 13
 
         tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")
         text = """Hello, my dog is cute and"""
diff --git a/tests/generation/test_logits_process.py b/tests/generation/test_logits_process.py
index 15f5cf1e4f4600..b1b3602c927dba 100644
--- a/tests/generation/test_logits_process.py
+++ b/tests/generation/test_logits_process.py
@@ -610,6 +610,13 @@ def prefix_allowed_tokens_fn(batch_id, inputs_ids):
             torch.isinf(filtered_scores).tolist(), [[False, False, True, True, True], [True, True, False, False, True]]
         )
 
+        def empty_prefix_allowed_tokens_fn(batch_id, inputs_ids):
+            return []
+
+        prefix_constrained_logits_proc = PrefixConstrainedLogitsProcessor(empty_prefix_allowed_tokens_fn, 1)
+
+        self.assertRaises(ValueError, prefix_constrained_logits_proc, input_ids, scores.clone())
+
     def test_hamming_diversity(self):
         vocab_size = 4
         num_beams = 2
@@ -692,7 +699,7 @@ def test_remove_nan_inf_logits_processor(self):
             torch.allclose(
                 scores,
                 torch.tensor(
-                    [[0.0, 0.7, 0.8, 0.0], [0.1, torch.finfo(scores.dtype).max, 0.3, float("-inf")]],
+                    [[0.0, 0.7, 0.8, 0.0], [0.1, torch.finfo(scores.dtype).max, 0.3, torch.finfo(scores.dtype).min]],
                     device=torch_device,
                 ),
                 atol=1e-6,
diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py
index 1705142f8f1fb3..6e11818f69a134 100644
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -20,9 +20,11 @@
 import warnings
 
 import numpy as np
+from parameterized import parameterized
 
-from transformers import is_torch_available, pipeline
+from transformers import is_torch_available, pipeline, set_seed
 from transformers.testing_utils import (
+    is_flaky,
     require_accelerate,
     require_torch,
     require_torch_multi_accelerator,
@@ -52,6 +54,7 @@
         SpeechEncoderDecoderModel,
         top_k_top_p_filtering,
     )
+    from transformers.cache_utils import DynamicCache
     from transformers.generation import (
         BeamSampleDecoderOnlyOutput,
         BeamSampleEncoderDecoderOutput,
@@ -103,11 +106,7 @@ def _get_input_ids_and_config(self, batch_size=2):
             if isinstance(config.eos_token_id, int):
                 config.eos_token_id = [config.eos_token_id]
             config.pad_token_id = config.eos_token_id[0]
-        # TransfoXL has no attention mask
-        if "transfoxl" in config.__class__.__name__.lower():
-            attention_mask = None
-        else:
-            attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:batch_size, :sequence_length]
+        attention_mask = torch.ones_like(input_ids, dtype=torch.long)[:batch_size, :sequence_length]
 
         return config, input_ids, attention_mask, max_length
 
@@ -123,9 +122,14 @@ def _get_logits_processor_and_kwargs(
         process_kwargs = {
             "min_length": input_length + 1 if max_length is None else max_length - 1,
             "bad_words_ids": [[1, 0]],
-            "no_repeat_ngram_size": 2,
             "repetition_penalty": 1.2,
+            "remove_invalid_values": True,
         }
+        # NoRepeatNGramLogitsProcessor + forced tokens may result in no valid continuations
+        if forced_bos_token_id is None and forced_eos_token_id is None:
+            process_kwargs["no_repeat_ngram_size"] = 2
+
+        # NOTE: the order of operations here should match `generate` for accurate testing
         logits_processor = LogitsProcessorList(
             (
                 [
@@ -153,12 +157,16 @@ def _get_logits_processor_and_kwargs(
                 if forced_eos_token_id is not None
                 else []
             )
-            + [
-                NoBadWordsLogitsProcessor(process_kwargs["bad_words_ids"], eos_token_id),
-                NoRepeatNGramLogitsProcessor(process_kwargs["no_repeat_ngram_size"]),
-                RepetitionPenaltyLogitsProcessor(process_kwargs["repetition_penalty"]),
-            ]
+            + [NoBadWordsLogitsProcessor(process_kwargs["bad_words_ids"], eos_token_id)]
+            + (
+                [NoRepeatNGramLogitsProcessor(process_kwargs["no_repeat_ngram_size"])]
+                if forced_bos_token_id is None and forced_eos_token_id is None
+                else []
+            )
+            + [RepetitionPenaltyLogitsProcessor(process_kwargs["repetition_penalty"])]
+            + [InfNanRemoveLogitsProcessor()]  # prevent flaky generation test failures
         )
+
         return process_kwargs, logits_processor
 
     @staticmethod
@@ -281,7 +289,6 @@ def _greedy_generate(
             output_hidden_states=output_hidden_states,
             output_scores=output_scores,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             **logits_process_kwargs,
             **model_kwargs,
         )
@@ -339,7 +346,6 @@ def _sample_generate(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             **logits_warper_kwargs,
             **process_kwargs,
             **model_kwargs,
@@ -360,9 +366,6 @@ def _sample_generate(
         elif attention_mask is not None:
             attention_mask = attention_mask.repeat_interleave(num_return_sequences, dim=0)
 
-        # prevent flaky generation test failures
-        logits_processor.append(InfNanRemoveLogitsProcessor())
-
         with torch.no_grad():
             model_kwargs = {"attention_mask": attention_mask} if attention_mask is not None else {}
             output_sample = model.sample(
@@ -404,7 +407,6 @@ def _beam_search_generate(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             **beam_kwargs,
             **logits_process_kwargs,
             **model_kwargs,
@@ -466,7 +468,6 @@ def _beam_sample_generate(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             **beam_kwargs,
             **logits_warper_kwargs,
             **model_kwargs,
@@ -533,7 +534,6 @@ def _group_beam_search_generate(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             **beam_kwargs,
             **logits_process_kwargs,
             **model_kwargs,
@@ -595,7 +595,6 @@ def _constrained_beam_search_generate(
             output_attentions=output_attentions,
             output_hidden_states=output_hidden_states,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             constraints=constraints,
             **beam_kwargs,
             **logits_process_kwargs,
@@ -670,7 +669,6 @@ def _contrastive_generate(
             output_hidden_states=output_hidden_states,
             output_scores=output_scores,
             return_dict_in_generate=return_dict_in_generate,
-            remove_invalid_values=True,
             **logits_process_kwargs,
             **model_kwargs,
             **contrastive_search_kwargs,
@@ -1283,13 +1281,8 @@ def test_constrained_beam_search_generate(self):
 
             # check `generate()` and `constrained_beam_search()` are equal
             # Sample constraints
-            if not input_ids.dtype == torch.float32:
-                min_id = torch.min(input_ids) + 3
-                max_id = torch.max(input_ids)
-            else:
-                # otherwise this throws an error for Speech2TextModel since its inputs are floating points
-                min_id = 3
-                max_id = 100
+            min_id = 3
+            max_id = config.vocab_size
 
             force_tokens = torch.randint(min_id, max_id, (1, 2)).tolist()[0]
             constraints = [
@@ -1506,10 +1499,14 @@ def test_contrastive_generate_low_memory(self):
             )
             self.assertListEqual(low_output.tolist(), high_output.tolist())
 
-    @slow  # TODO(Joao): remove this. Some models (e.g. data2vec, xcom, roberta) have an error rate between 1 and 10%.
+    @is_flaky()  # Read NOTE (1) below. If there are API issues, all attempts will fail.
     def test_assisted_decoding_matches_greedy_search(self):
         # This test ensures that the assisted generation does not introduce output changes over greedy search.
-        # It breaks the pattern in the tests above, for multiple reasons:
+        # NOTE (1): The sentence above is true most of the time, there is a tiny difference in the logits due to matmul
+        # shape differences -- and it may result in a different output. The input shape difference happens in the
+        # main model, that runs the forward pass with several candidates at once (as opposed to generating one token at
+        # a time). See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535 for more info.
+        # NOTE (2): It breaks the pattern in the tests above, for multiple reasons:
         # - assisted_decoding, contrarily to the other methods, can't be called on its own (e.g. needs to
         # prepare the assistant encoder outputs in the main generate body);
         # - assisted_decoding does not support `use_cache = False`
@@ -1520,77 +1517,82 @@ def test_assisted_decoding_matches_greedy_search(self):
                 self.skipTest("Won't fix: old model with different cache format")
             if any(
                 model_name in model_class.__name__.lower()
-                for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet"]
+                for model_name in [
+                    "bigbirdpegasus",
+                    "led",
+                    "mega",
+                    "speech2text",
+                    "git",
+                    "prophetnet",
+                    "seamlessm4t",
+                    "clvp",
+                ]
             ):
                 self.skipTest("May fix in the future: need model-specific fixes")
 
-            # This for loop is a naive and temporary effort to make the test less flaky.
-            failed = 0
-            for i in range(10):
-                # enable cache
-                config, input_ids, attention_mask, max_length = self._get_input_ids_and_config(batch_size=1)
-
-                # NOTE: assisted generation only works with cache on at the moment.
-                if not hasattr(config, "use_cache"):
-                    self.skipTest("This model doesn't support caching")
+            # enable cache
+            config, input_ids, attention_mask, _ = self._get_input_ids_and_config(batch_size=1)
 
-                config.use_cache = True
-                config.is_decoder = True
-                model = model_class(config).to(torch_device).eval()
-                output_greedy = model.generate(
-                    input_ids,
-                    attention_mask=attention_mask,
-                    max_length=max_length,
-                    num_beams=1,
-                    do_sample=False,
-                    output_scores=True,
-                    output_hidden_states=True,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                )
-                # Note: with assisted generate, if the same model is used as assistant, then all assistant tokens will
-                # be correct
-                output_assisted = model.generate(
-                    input_ids,
-                    attention_mask=attention_mask,
-                    max_length=max_length,
-                    num_beams=1,
-                    do_sample=False,
-                    assistant_model=model,
-                    output_scores=True,
-                    output_hidden_states=True,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                )
+            # NOTE: assisted generation only works with cache on at the moment.
+            if not hasattr(config, "use_cache"):
+                self.skipTest("This model doesn't support caching")
 
-                try:
-                    self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist())
+            config.use_cache = True
+            config.is_decoder = True
+            model = model_class(config).to(torch_device).eval()
+            # Sets assisted generation arguments such that:
+            # a) no EOS is generated, to ensure generation doesn't break early
+            # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of
+            #    the assistant model is correct
+            # c) there are at least two forward passes in the main model, to ensure the input preparation of
+            #    the main model is correct
+            generation_kwargs = {
+                "eos_token_id": -1,  # see a)
+                "max_new_tokens": 4,  # see c)
+                "num_beams": 1,
+                "do_sample": False,
+                "output_scores": True,
+                "output_hidden_states": True,
+                "output_attentions": True,
+                "return_dict_in_generate": True,
+            }
+            output_greedy = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
 
-                    for output in (output_greedy, output_assisted):
-                        self._check_outputs(output, input_ids, model.config, use_cache=True)
-                except AssertionError:
-                    failed += 1
-                    if failed > 1:
-                        self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist())
+            assistant_model = model
+            assistant_model.generation_config.num_assistant_tokens = 2  # see b)
+            assistant_model.generation_config.num_assistant_tokens_schedule = "constant"  # see b)
+            generation_kwargs.update({"assistant_model": assistant_model})
+            output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
 
-                        for output in (output_greedy, output_assisted):
-                            self._check_outputs(output, input_ids, model.config, use_cache=True)
+            # The two outputs must match and their shape must be as expected
+            self.assertListEqual(output_greedy.sequences.tolist(), output_assisted.sequences.tolist())
+            for output in (output_greedy, output_assisted):
+                self._check_outputs(output, input_ids, model.config, use_cache=True)
 
-    @unittest.skip("Failing for a lot of models du to attention mask size missmatch. Works well when standalone.")
     def test_assisted_decoding_sample(self):
-        # Seeded assisted decoding will not match sample for the same seed, as the forward pass does not return the
-        # exact same logits (the forward pass of the main model, now with several tokens at once, has causal masking).
+        # In this test we don't check assisted vs non-assisted output -- seeded assisted decoding with sample will not
+        # match sample for the same seed, as the forward pass does not return the exact same logits (due to matmul with
+        # different shapes, see https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535).
         for model_class in self.all_generative_model_classes:
             if any(model_name in model_class.__name__.lower() for model_name in ["fsmt", "reformer"]):
                 self.skipTest("Won't fix: old model with different cache format")
             if any(
                 model_name in model_class.__name__.lower()
-                for model_name in ["bigbirdpegasus", "led", "mega", "speech2text", "git", "prophetnet", "seamlessm4t"]
+                for model_name in [
+                    "bigbirdpegasus",
+                    "led",
+                    "mega",
+                    "speech2text",
+                    "git",
+                    "prophetnet",
+                    "seamlessm4t",
+                    "clvp",
+                ]
             ):
                 self.skipTest("May fix in the future: need model-specific fixes")
 
             # enable cache
-            config, input_ids, attention_mask, max_length = self._get_input_ids_and_config(batch_size=1)
+            config, input_ids, attention_mask, _ = self._get_input_ids_and_config(batch_size=1)
 
             # NOTE: assisted generation only works with cache on at the moment.
             if not hasattr(config, "use_cache"):
@@ -1599,18 +1601,27 @@ def test_assisted_decoding_sample(self):
             config.use_cache = True
             config.is_decoder = True
             model = model_class(config).to(torch_device).eval()
-            output_assisted = model.generate(
-                input_ids,
-                attention_mask=attention_mask,
-                max_length=max_length,
-                num_beams=1,
-                do_sample=True,
-                assistant_model=model,  # triggers assisted decoding
-                output_scores=True,
-                output_hidden_states=True,
-                output_attentions=True,
-                return_dict_in_generate=True,
-            )
+            # Sets assisted generation arguments such that:
+            # a) no EOS is generated, to ensure generation doesn't break early
+            # b) the assistant model always generates two tokens when it is called, to ensure the input preparation of
+            #    the assistant model is correct
+            # c) there are at least two forward passes in the main model, to ensure the input preparation of
+            #    the main model is correct
+            assistant_model = model
+            assistant_model.generation_config.num_assistant_tokens = 2  # see b)
+            assistant_model.generation_config.num_assistant_tokens_schedule = "constant"  # see b)
+            generation_kwargs = {
+                "eos_token_id": -1,  # see a)
+                "max_new_tokens": 4,  # see c)
+                "num_beams": 1,
+                "do_sample": True,
+                "assistant_model": assistant_model,
+                "output_scores": True,
+                "output_hidden_states": True,
+                "output_attentions": True,
+                "return_dict_in_generate": True,
+            }
+            output_assisted = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
 
             self._check_outputs(output_assisted, input_ids, model.config, use_cache=True)
 
@@ -1895,6 +1906,66 @@ def test_generate_continue_from_past_key_values(self):
                         )
                     )
 
+    @parameterized.expand([(1, False), (1, True), (4, False)])
+    def test_new_cache_format(self, num_beams, do_sample):
+        # Tests that generating with the new format is exactly the same as the legacy one (for models that support it).
+        # 👉 tests with and without beam search so that we can test with and without cache reordering.
+        # 👉 tests with and without sampling so we can cover the most common use cases.
+        for model_class in self.all_generative_model_classes:
+            if not model_class._supports_cache_class:
+                self.skipTest("This model does not support the new cache format")
+
+            config, input_ids, attention_mask, _ = self._get_input_ids_and_config()
+            config.use_cache = True
+            config.is_decoder = True
+
+            model = model_class(config).to(torch_device).eval()
+            generation_kwargs = {
+                "max_new_tokens": 5,
+                "do_sample": do_sample,
+                "num_beams": num_beams,
+                "num_return_sequences": num_beams,
+                "return_dict_in_generate": True,  # Required to return `past_key_values`
+            }
+
+            # Sets seed before calling `generate` for the case with do_sample=True
+            seed = torch.randint(0, 1000000, (1,)).item()
+            set_seed(seed)
+            legacy_results = model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
+            set_seed(seed)
+            new_results = model.generate(
+                input_ids, attention_mask=attention_mask, past_key_values=DynamicCache(), **generation_kwargs
+            )
+
+            # The two sets of generated sequences must match, despite the cache format between forward passes being
+            # different
+            self.assertListEqual(legacy_results.sequences.tolist(), new_results.sequences.tolist())
+            self.assertTrue(isinstance(legacy_results.past_key_values, tuple))
+            self.assertTrue(isinstance(new_results.past_key_values, DynamicCache))
+
+            # The contents of the two caches, when converted to the same format (in both directions!), must match
+            legacy_cache = legacy_results.past_key_values
+            new_cache_converted = new_results.past_key_values.to_legacy_cache()
+            for layer_idx in range(len(legacy_cache)):
+                for kv_idx in range(len(legacy_cache[layer_idx])):
+                    self.assertTrue(
+                        torch.allclose(
+                            legacy_cache[layer_idx][kv_idx],
+                            new_cache_converted[layer_idx][kv_idx],
+                        )
+                    )
+
+            new_cache = new_results.past_key_values
+            legacy_cache_converted = DynamicCache.from_legacy_cache(legacy_results.past_key_values)
+            for layer_idx in range(len(new_cache)):
+                for kv_idx in range(len(new_cache[layer_idx])):
+                    self.assertTrue(
+                        torch.allclose(
+                            new_cache[layer_idx][kv_idx],
+                            legacy_cache_converted[layer_idx][kv_idx],
+                        )
+                    )
+
     def _check_outputs(self, output, input_ids, config, use_cache=False, num_return_sequences=1):
         batch_size, seq_length = input_ids.shape
         num_sequences_in_output = batch_size * num_return_sequences
diff --git a/tests/models/bark/test_modeling_bark.py b/tests/models/bark/test_modeling_bark.py
index 713fb6c3ee790a..1246fa561583a6 100644
--- a/tests/models/bark/test_modeling_bark.py
+++ b/tests/models/bark/test_modeling_bark.py
@@ -890,13 +890,11 @@ def test_flash_attn_2_inference(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
-                model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
-                )
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
                 model.to(torch_device)
 
                 dummy_input = inputs_dict["input_ids"][:1]
@@ -949,12 +947,13 @@ def test_flash_attn_2_inference_padding_right(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
+                    tmpdirname,
+                    torch_dtype=torch.bfloat16,
                 )
                 model.to(torch_device)
 
diff --git a/tests/models/beit/test_modeling_beit.py b/tests/models/beit/test_modeling_beit.py
index 8774503e769402..fdf4607d62693f 100644
--- a/tests/models/beit/test_modeling_beit.py
+++ b/tests/models/beit/test_modeling_beit.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch BEiT model. """
 
 
-import inspect
 import unittest
 
 from datasets import load_dataset
@@ -26,6 +25,7 @@
 from transformers.testing_utils import require_torch, require_torch_multi_gpu, require_vision, slow, torch_device
 from transformers.utils import cached_property, is_torch_available, is_vision_available
 
+from ...test_backbone_common import BackboneTesterMixin
 from ...test_configuration_common import ConfigTester
 from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
@@ -36,7 +36,9 @@
     from torch import nn
 
     from transformers import (
+        MODEL_FOR_BACKBONE_MAPPING,
         MODEL_MAPPING,
+        BeitBackbone,
         BeitForImageClassification,
         BeitForMaskedImageModeling,
         BeitForSemanticSegmentation,
@@ -64,7 +66,7 @@ def __init__(
         is_training=True,
         use_labels=True,
         hidden_size=32,
-        num_hidden_layers=2,
+        num_hidden_layers=4,
         num_attention_heads=4,
         intermediate_size=37,
         hidden_act="gelu",
@@ -74,10 +76,11 @@ def __init__(
         initializer_range=0.02,
         num_labels=3,
         scope=None,
-        out_indices=[0, 1, 2, 3],
+        out_indices=[1, 2, 3, 4],
+        out_features=["stage1", "stage2", "stage3", "stage4"],
     ):
         self.parent = parent
-        self.vocab_size = 100
+        self.vocab_size = vocab_size
         self.batch_size = batch_size
         self.image_size = image_size
         self.patch_size = patch_size
@@ -95,6 +98,7 @@ def __init__(
         self.initializer_range = initializer_range
         self.scope = scope
         self.out_indices = out_indices
+        self.out_features = out_features
         self.num_labels = num_labels
 
         # in BeiT, the seq length equals the number of patches + 1 (we add 1 for the [CLS] token)
@@ -130,6 +134,7 @@ def get_config(self):
             is_decoder=False,
             initializer_range=self.initializer_range,
             out_indices=self.out_indices,
+            out_features=self.out_features,
         )
 
     def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
@@ -139,6 +144,38 @@ def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
         result = model(pixel_values)
         self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
 
+    def create_and_check_backbone(self, config, pixel_values, labels, pixel_labels):
+        model = BeitBackbone(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(pixel_values)
+
+        # verify hidden states
+        self.parent.assertEqual(len(result.feature_maps), len(config.out_features))
+        expected_height = expected_width = self.image_size // config.patch_size
+        self.parent.assertListEqual(
+            list(result.feature_maps[0].shape), [self.batch_size, self.hidden_size, expected_height, expected_width]
+        )
+
+        # verify channels
+        self.parent.assertEqual(len(model.channels), len(config.out_features))
+
+        # verify backbone works with out_features=None
+        config.out_features = None
+        model = BeitBackbone(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(pixel_values)
+
+        # verify feature maps
+        self.parent.assertEqual(len(result.feature_maps), 1)
+        self.parent.assertListEqual(
+            list(result.feature_maps[0].shape), [self.batch_size, self.hidden_size, expected_height, expected_width]
+        )
+
+        # verify channels
+        self.parent.assertEqual(len(model.channels), 1)
+
     def create_and_check_for_masked_lm(self, config, pixel_values, labels, pixel_labels):
         model = BeitForMaskedImageModeling(config=config)
         model.to(torch_device)
@@ -193,7 +230,13 @@ class BeitModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     """
 
     all_model_classes = (
-        (BeitModel, BeitForImageClassification, BeitForMaskedImageModeling, BeitForSemanticSegmentation)
+        (
+            BeitModel,
+            BeitForImageClassification,
+            BeitForMaskedImageModeling,
+            BeitForSemanticSegmentation,
+            BeitBackbone,
+        )
         if is_torch_available()
         else ()
     )
@@ -227,6 +270,10 @@ def test_inputs_embeds(self):
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
+    @unittest.skip(reason="BEiT does not support feedforward chunking yet")
+    def test_feed_forward_chunking(self):
+        pass
+
     def test_model_common_attributes(self):
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -236,22 +283,14 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
 
+    def test_backbone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_backbone(*config_and_inputs)
+
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_masked_lm(*config_and_inputs)
@@ -273,7 +312,11 @@ def test_training(self):
 
         for model_class in self.all_model_classes:
             # we don't test BeitForMaskedImageModeling
-            if model_class in [*get_values(MODEL_MAPPING), BeitForMaskedImageModeling]:
+            if model_class in [
+                *get_values(MODEL_MAPPING),
+                *get_values(MODEL_FOR_BACKBONE_MAPPING),
+                BeitForMaskedImageModeling,
+            ]:
                 continue
 
             model = model_class(config)
@@ -294,7 +337,8 @@ def test_training_gradient_checkpointing(self):
         for model_class in self.all_model_classes:
             # we don't test BeitForMaskedImageModeling
             if (
-                model_class in [*get_values(MODEL_MAPPING), BeitForMaskedImageModeling]
+                model_class
+                in [*get_values(MODEL_MAPPING), *get_values(MODEL_FOR_BACKBONE_MAPPING), BeitForMaskedImageModeling]
                 or not model_class.supports_gradient_checkpointing
             ):
                 continue
@@ -500,3 +544,12 @@ def test_post_processing_semantic_segmentation(self):
         segmentation = image_processor.post_process_semantic_segmentation(outputs=outputs)
         expected_shape = torch.Size((160, 160))
         self.assertEqual(segmentation[0].shape, expected_shape)
+
+
+@require_torch
+class BeitBackboneTest(unittest.TestCase, BackboneTesterMixin):
+    all_model_classes = (BeitBackbone,) if is_torch_available() else ()
+    config_class = BeitConfig
+
+    def setUp(self):
+        self.model_tester = BeitModelTester(self)
diff --git a/tests/models/bit/test_modeling_bit.py b/tests/models/bit/test_modeling_bit.py
index d7d2c60347f69b..03e2bd1095191d 100644
--- a/tests/models/bit/test_modeling_bit.py
+++ b/tests/models/bit/test_modeling_bit.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Bit model. """
 
 
-import inspect
 import unittest
 
 from transformers import BitConfig
@@ -202,18 +201,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/clvp/test_modeling_clvp.py b/tests/models/clvp/test_modeling_clvp.py
index 1b3ab79034a917..e27d9e08eb7e12 100644
--- a/tests/models/clvp/test_modeling_clvp.py
+++ b/tests/models/clvp/test_modeling_clvp.py
@@ -38,6 +38,7 @@
     ids_tensor,
     random_attention_mask,
 )
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
@@ -281,9 +282,10 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class ClvpDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (ClvpModel, ClvpForCausalLM) if is_torch_available() else ()
     all_generative_model_classes = (ClvpForCausalLM,) if is_torch_available() else ()
+    pipeline_model_mapping = {"feature-extraction": ClvpModelForConditionalGeneration} if is_torch_available() else {}
 
     test_pruning = False
 
@@ -602,12 +604,7 @@ def test_text_and_speech_encoder_models(self):
         text_embeds = self.model.text_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()
 
         # fmt: off
-        EXPECTED_TEXT_EMBEDS = torch.tensor(
-            [ 1.8060e+00, -2.7928e+00,  3.2021e+00, -1.5673e+00,  2.3284e+00, -3.2065e+00, -1.3368e+00,  2.2322e+00,
-              -1.7667e+00,  4.1505e-01, 2.4119e+00, -5.8133e-03, -4.6367e+00,  1.6450e-01,  6.7459e+00, 6.6292e+00,
-              1.1046e+00,  3.6196e+00, -1.0496e+01,  5.4924e+00
-            ]
-        )
+        EXPECTED_TEXT_EMBEDS = torch.tensor([1.4798, -2.0005, 2.3902, -0.5042, 1.6401, -2.4135, -1.4800, 3.0118, -2.4422, 1.3266, 2.2339, 1.4761, -4.8983, -1.3592, 6.0251, 6.7364, 2.2576, 3.7229, -10.0436, 4.6676])
         # fmt: on
 
         self.assertTrue(torch.allclose(text_embeds[0, :20], EXPECTED_TEXT_EMBEDS, atol=1e-4))
@@ -616,11 +613,7 @@ def test_text_and_speech_encoder_models(self):
         speech_embeds = self.model.speech_encoder_model(input_ids=self.text_tokens, return_dict=True)[0].cpu()
 
         # fmt: off
-        EXPECTED_SPEECH_EMBEDS = torch.tensor(
-            [ 4.6143, -5.5784,  0.8983, -3.9665, -0.6714, -1.0665, -1.1277,  1.5619, 2.6322, -7.2008, -2.4932,  0.3265,
-              -1.4738,  0.1425,  5.0825,  4.1760, -5.4708,  2.1935, -6.0044,  3.9540
-            ]
-        )
+        EXPECTED_SPEECH_EMBEDS = torch.tensor([3.1202, -3.1183, -1.4264, -6.1339, 1.8885, -0.1983, 0.9461, -1.7414, 0.3320, -3.8400, -1.5715, 1.5096, -1.7576, 0.2387, 4.9758, 5.8450, -6.2534, 2.8587, -5.5816, 4.7821])
         # fmt: on
 
         self.assertTrue(torch.allclose(speech_embeds[0, :20], EXPECTED_SPEECH_EMBEDS, atol=1e-4))
@@ -633,8 +626,10 @@ def test_full_model_integration(self):
             num_beams=4,
             num_return_sequences=4,
             max_new_tokens=10,
-        ).speech_ids.cpu()
+        )
 
-        EXPECTED_OUTPUTS = torch.tensor([[1953, 1080, 612], [1953, 1953, 612], [1953, 612, 716]])
+        EXPECTED_SPEECH_IDS = torch.tensor([[1953, 1080, 612], [1953, 612, 493], [1953, 612, 716]])
+        EXPECTED_SIMILARITY_SCORES = torch.tensor([[14.7660, 14.4569, 13.6472, 13.5683]])
 
-        self.assertTrue(torch.allclose(full_model_output[-3:, -3:], EXPECTED_OUTPUTS))
+        self.assertTrue(torch.allclose(full_model_output.speech_ids.cpu()[-3:, -3:], EXPECTED_SPEECH_IDS))
+        self.assertTrue(torch.allclose(full_model_output.logits_per_text.cpu(), EXPECTED_SIMILARITY_SCORES))
diff --git a/tests/models/conditional_detr/test_image_processing_conditional_detr.py b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
index cc3fd501a64e10..4b18a6ecd7faf0 100644
--- a/tests/models/conditional_detr/test_image_processing_conditional_detr.py
+++ b/tests/models/conditional_detr/test_image_processing_conditional_detr.py
@@ -21,7 +21,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 
 @require_torch
 @require_vision
-class ConditionalDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class ConditionalDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = ConditionalDetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/convnext/test_modeling_convnext.py b/tests/models/convnext/test_modeling_convnext.py
index 397fa596f10230..ac2b6f927c8dfc 100644
--- a/tests/models/convnext/test_modeling_convnext.py
+++ b/tests/models/convnext/test_modeling_convnext.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ConvNext model. """
 
 
-import inspect
 import unittest
 
 from transformers import ConvNextConfig
@@ -212,18 +211,6 @@ def test_model_common_attributes(self):
     def test_feed_forward_chunking(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/convnextv2/test_modeling_convnextv2.py b/tests/models/convnextv2/test_modeling_convnextv2.py
index c3f8804f1ccad9..694901a1846994 100644
--- a/tests/models/convnextv2/test_modeling_convnextv2.py
+++ b/tests/models/convnextv2/test_modeling_convnextv2.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ConvNextV2 model. """
 
 
-import inspect
 import unittest
 
 from transformers import ConvNextV2Config
@@ -265,18 +264,6 @@ def test_training_gradient_checkpointing(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/cvt/test_modeling_cvt.py b/tests/models/cvt/test_modeling_cvt.py
index 6f4f63f0f9df79..4abeb5571c7b58 100644
--- a/tests/models/cvt/test_modeling_cvt.py
+++ b/tests/models/cvt/test_modeling_cvt.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch CvT model. """
 
 
-import inspect
 import unittest
 from math import floor
 
@@ -191,18 +190,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/data2vec/test_modeling_data2vec_vision.py b/tests/models/data2vec/test_modeling_data2vec_vision.py
index 69a763a4f2ecd9..bdb95588ac5cb7 100644
--- a/tests/models/data2vec/test_modeling_data2vec_vision.py
+++ b/tests/models/data2vec/test_modeling_data2vec_vision.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Data2VecVision model. """
 
 
-import inspect
 import unittest
 
 from transformers import Data2VecVisionConfig
@@ -220,18 +219,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/deformable_detr/test_image_processing_deformable_detr.py b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
index 4fd2de49f78ae4..ec65f7b9a58602 100644
--- a/tests/models/deformable_detr/test_image_processing_deformable_detr.py
+++ b/tests/models/deformable_detr/test_image_processing_deformable_detr.py
@@ -21,7 +21,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 
 @require_torch
 @require_vision
-class DeformableDetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class DeformableDetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DeformableDetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/deit/test_modeling_deit.py b/tests/models/deit/test_modeling_deit.py
index 4a9945a731fd8d..9cd5be8fd3752c 100644
--- a/tests/models/deit/test_modeling_deit.py
+++ b/tests/models/deit/test_modeling_deit.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch DeiT model. """
 
 
-import inspect
 import unittest
 import warnings
 
@@ -238,18 +237,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/deta/test_image_processing_deta.py b/tests/models/deta/test_image_processing_deta.py
index 7cde8474bf1019..1e481476077d2b 100644
--- a/tests/models/deta/test_image_processing_deta.py
+++ b/tests/models/deta/test_image_processing_deta.py
@@ -21,7 +21,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 
 @require_torch
 @require_vision
-class DetaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class DetaImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DetaImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/deta/test_modeling_deta.py b/tests/models/deta/test_modeling_deta.py
index d5bf32acaba7e0..8581723ccb3b72 100644
--- a/tests/models/deta/test_modeling_deta.py
+++ b/tests/models/deta/test_modeling_deta.py
@@ -162,6 +162,26 @@ def create_and_check_deta_model(self, config, pixel_values, pixel_mask, labels):
 
         self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size))
 
+    def create_and_check_deta_freeze_backbone(self, config, pixel_values, pixel_mask, labels):
+        model = DetaModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        model.freeze_backbone()
+
+        for _, param in model.backbone.model.named_parameters():
+            self.parent.assertEqual(False, param.requires_grad)
+
+    def create_and_check_deta_unfreeze_backbone(self, config, pixel_values, pixel_mask, labels):
+        model = DetaModel(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        model.unfreeze_backbone()
+
+        for _, param in model.backbone.model.named_parameters():
+            self.parent.assertEqual(True, param.requires_grad)
+
     def create_and_check_deta_object_detection_head_model(self, config, pixel_values, pixel_mask, labels):
         model = DetaForObjectDetection(config=config)
         model.to(torch_device)
@@ -250,6 +270,14 @@ def test_deta_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_deta_model(*config_and_inputs)
 
+    def test_deta_freeze_backbone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_deta_freeze_backbone(*config_and_inputs)
+
+    def test_deta_unfreeze_backbone(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_deta_unfreeze_backbone(*config_and_inputs)
+
     def test_deta_object_detection_head_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_deta_object_detection_head_model(*config_and_inputs)
diff --git a/tests/models/detr/test_image_processing_detr.py b/tests/models/detr/test_image_processing_detr.py
index 2a095b259ecda6..7a5cb9efed6fe0 100644
--- a/tests/models/detr/test_image_processing_detr.py
+++ b/tests/models/detr/test_image_processing_detr.py
@@ -21,7 +21,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 
 @require_torch
 @require_vision
-class DetrImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class DetrImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = DetrImageProcessor if is_vision_available() else None
 
     def setUp(self):
@@ -159,6 +159,63 @@ def test_image_processor_from_dict_with_kwargs(self):
         self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84})
         self.assertEqual(image_processor.do_pad, False)
 
+    def test_should_raise_if_annotation_format_invalid(self):
+        image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
+
+        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
+            detection_target = json.loads(f.read())
+
+        annotations = {"image_id": 39769, "annotations": detection_target}
+
+        params = {
+            "images": Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png"),
+            "annotations": annotations,
+            "return_tensors": "pt",
+        }
+
+        image_processor_params = {**image_processor_dict, **{"format": "_INVALID_FORMAT_"}}
+        image_processor = self.image_processing_class(**image_processor_params)
+
+        with self.assertRaises(ValueError) as e:
+            image_processor(**params)
+
+        self.assertTrue(str(e.exception).startswith("_INVALID_FORMAT_ is not a valid AnnotationFormat"))
+
+    def test_valid_coco_detection_annotations(self):
+        # prepare image and target
+        image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+        with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f:
+            target = json.loads(f.read())
+
+        params = {"image_id": 39769, "annotations": target}
+
+        # encode them
+        image_processing = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
+
+        # legal encodings (single image)
+        _ = image_processing(images=image, annotations=params, return_tensors="pt")
+        _ = image_processing(images=image, annotations=[params], return_tensors="pt")
+
+        # legal encodings (batch of one image)
+        _ = image_processing(images=[image], annotations=params, return_tensors="pt")
+        _ = image_processing(images=[image], annotations=[params], return_tensors="pt")
+
+        # legal encoding (batch of more than one image)
+        n = 5
+        _ = image_processing(images=[image] * n, annotations=[params] * n, return_tensors="pt")
+
+        # example of an illegal encoding (missing the 'image_id' key)
+        with self.assertRaises(ValueError) as e:
+            image_processing(images=image, annotations={"annotations": target}, return_tensors="pt")
+
+        self.assertTrue(str(e.exception).startswith("Invalid COCO detection annotations"))
+
+        # example of an illegal encoding (unequal lengths of images and annotations)
+        with self.assertRaises(ValueError) as e:
+            image_processing(images=[image] * n, annotations=[params] * (n - 1), return_tensors="pt")
+
+        self.assertTrue(str(e.exception) == "The number of images (5) and annotations (4) do not match.")
+
     @slow
     def test_call_pytorch_with_coco_detection_annotations(self):
         # prepare image and target
diff --git a/tests/models/dinat/test_modeling_dinat.py b/tests/models/dinat/test_modeling_dinat.py
index a7e0b7d0650e6e..c824060cf816b2 100644
--- a/tests/models/dinat/test_modeling_dinat.py
+++ b/tests/models/dinat/test_modeling_dinat.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Dinat model. """
 
 import collections
-import inspect
 import unittest
 
 from transformers import DinatConfig
@@ -264,18 +263,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         self.skipTest("Dinat's attention operation is handled entirely by NATTEN.")
 
diff --git a/tests/models/dinov2/test_modeling_dinov2.py b/tests/models/dinov2/test_modeling_dinov2.py
index a040356fb798ee..8e68165754b0ed 100644
--- a/tests/models/dinov2/test_modeling_dinov2.py
+++ b/tests/models/dinov2/test_modeling_dinov2.py
@@ -15,11 +15,11 @@
 """ Testing suite for the PyTorch Dinov2 model. """
 
 
-import inspect
 import unittest
 
 from transformers import Dinov2Config
 from transformers.testing_utils import (
+    is_flaky,
     require_torch,
     require_vision,
     slow,
@@ -221,7 +221,7 @@ class Dinov2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
         if is_torch_available()
         else {}
     )
-    fx_compatible = False
+    fx_compatible = True
 
     test_pruning = False
     test_resize_embeddings = False
@@ -231,6 +231,10 @@ def setUp(self):
         self.model_tester = Dinov2ModelTester(self)
         self.config_tester = ConfigTester(self, config_class=Dinov2Config, has_text_modality=False, hidden_size=37)
 
+    @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.")
+    def test_initialization(self):
+        super().test_initialization()
+
     def test_config(self):
         self.config_tester.run_common_tests()
 
@@ -265,18 +269,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/distilbert/test_modeling_distilbert.py b/tests/models/distilbert/test_modeling_distilbert.py
index 8194c4285916de..9ab9d01577a974 100644
--- a/tests/models/distilbert/test_modeling_distilbert.py
+++ b/tests/models/distilbert/test_modeling_distilbert.py
@@ -319,13 +319,11 @@ def test_flash_attn_2_inference(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
-                model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
-                )
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16)
                 model.to(torch_device)
 
                 logits = model(dummy_input, output_hidden_states=True).hidden_states[-1]
@@ -373,12 +371,13 @@ def test_flash_attn_2_inference_padding_right(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
+                    tmpdirname,
+                    torch_dtype=torch.bfloat16,
                 )
                 model.to(torch_device)
 
diff --git a/tests/models/donut/test_modeling_donut_swin.py b/tests/models/donut/test_modeling_donut_swin.py
index 2a0d9f5e17cbdb..e52e679e42e682 100644
--- a/tests/models/donut/test_modeling_donut_swin.py
+++ b/tests/models/donut/test_modeling_donut_swin.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Donut Swin model. """
 
 import collections
-import inspect
 import unittest
 
 from transformers import DonutSwinConfig
@@ -186,18 +185,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
diff --git a/tests/models/dpt/test_modeling_dpt.py b/tests/models/dpt/test_modeling_dpt.py
index 100803bcdebfeb..0b398c923e686f 100644
--- a/tests/models/dpt/test_modeling_dpt.py
+++ b/tests/models/dpt/test_modeling_dpt.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch DPT model. """
 
 
-import inspect
 import unittest
 
 from transformers import DPTConfig
@@ -195,18 +194,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/dpt/test_modeling_dpt_auto_backbone.py b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
index 95e3128ff0edeb..76ab220583fed8 100644
--- a/tests/models/dpt/test_modeling_dpt_auto_backbone.py
+++ b/tests/models/dpt/test_modeling_dpt_auto_backbone.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch DPT model. """
 
 
-import inspect
 import unittest
 
 from transformers import Dinov2Config, DPTConfig
@@ -154,18 +153,6 @@ def test_config(self):
     def test_inputs_embeds(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_for_depth_estimation(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_depth_estimation(*config_and_inputs)
diff --git a/tests/models/dpt/test_modeling_dpt_hybrid.py b/tests/models/dpt/test_modeling_dpt_hybrid.py
index 82055b210557f9..6898637951411c 100644
--- a/tests/models/dpt/test_modeling_dpt_hybrid.py
+++ b/tests/models/dpt/test_modeling_dpt_hybrid.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch DPT model. """
 
 
-import inspect
 import unittest
 
 from transformers import DPTConfig
@@ -209,18 +208,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/efficientformer/test_modeling_efficientformer.py b/tests/models/efficientformer/test_modeling_efficientformer.py
index 2774a210da567e..73283fbbf60026 100644
--- a/tests/models/efficientformer/test_modeling_efficientformer.py
+++ b/tests/models/efficientformer/test_modeling_efficientformer.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch EfficientFormer model. """
 
 
-import inspect
 import unittest
 import warnings
 from typing import List
@@ -223,18 +222,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/efficientnet/test_modeling_efficientnet.py b/tests/models/efficientnet/test_modeling_efficientnet.py
index 38a359c574f801..32050e3d21a5e1 100644
--- a/tests/models/efficientnet/test_modeling_efficientnet.py
+++ b/tests/models/efficientnet/test_modeling_efficientnet.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch EfficientNet model. """
 
 
-import inspect
 import unittest
 
 from transformers import EfficientNetConfig
@@ -172,18 +171,6 @@ def test_model_common_attributes(self):
     def test_feed_forward_chunking(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py
index 5956a9ed6bf02f..fa7ea2af816cb0 100644
--- a/tests/models/falcon/test_modeling_falcon.py
+++ b/tests/models/falcon/test_modeling_falcon.py
@@ -15,6 +15,7 @@
 """ Testing suite for the PyTorch Falcon model. """
 
 
+import tempfile
 import unittest
 
 from parameterized import parameterized
@@ -26,7 +27,7 @@
     is_torch_available,
     set_seed,
 )
-from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+from transformers.testing_utils import require_bitsandbytes, require_torch, require_torch_sdpa, slow, torch_device
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -298,6 +299,12 @@ class FalconModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix
     test_headmasking = False
     test_pruning = False
 
+    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        return True
+
     def setUp(self):
         self.model_tester = FalconModelTester(self)
         self.config_tester = ConfigTester(self, config_class=FalconConfig, hidden_size=37)
@@ -431,6 +438,76 @@ def test_model_rope_scaling(self, scaling_type):
         # The output should be different for long inputs
         self.assertFalse(torch.allclose(original_long_output, scaled_long_output, atol=1e-5))
 
+    @require_torch_sdpa
+    @slow
+    def test_eager_matches_sdpa_generate(self):
+        max_new_tokens = 30
+
+        if len(self.all_generative_model_classes) == 0:
+            self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
+
+        for model_class in self.all_generative_model_classes:
+            if not model_class._supports_sdpa:
+                self.skipTest(f"{model_class.__name__} does not support SDPA")
+
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            dummy_input = inputs_dict[model_class.main_input_name]
+            if dummy_input.dtype in [torch.float32, torch.bfloat16]:
+                dummy_input = dummy_input.to(torch.float16)
+
+            # make sure that all models have enough positions for generation
+            if hasattr(config, "max_position_embeddings"):
+                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
+
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
+
+                model_sdpa = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    low_cpu_mem_usage=True,
+                ).to(torch_device)
+
+                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    low_cpu_mem_usage=True,
+                    attn_implementation="eager",
+                ).to(torch_device)
+
+                self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+                # NOTE: This check is disabled for Falcon as the non-SDPA/SDPA implementation is in the same class (legacy reason).
+                # for name, submodule in model_eager.named_modules():
+                #     if "SdpaAttention" in submodule.__class__.__name__:
+                #         raise ValueError("The eager model should not have SDPA attention layers")
+
+                # has_sdpa = False
+                # for name, submodule in model_sdpa.named_modules():
+                #     if "SdpaAttention" in submodule.__class__.__name__:
+                #         has_sdpa = True
+                #         break
+                # if not has_sdpa:
+                #     raise ValueError("The SDPA model should have SDPA attention layers")
+
+                # Just test that a large cache works as expected
+                res_eager = model_eager.generate(
+                    dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
+                )
+
+                res_sdpa = model_sdpa.generate(
+                    dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
+                )
+
+                self.assertTrue(torch.allclose(res_eager, res_sdpa))
+
 
 @require_torch
 class FalconLanguageGenerationTest(unittest.TestCase):
diff --git a/tests/models/focalnet/test_modeling_focalnet.py b/tests/models/focalnet/test_modeling_focalnet.py
index ce96f0ade414fc..6de095d975234d 100644
--- a/tests/models/focalnet/test_modeling_focalnet.py
+++ b/tests/models/focalnet/test_modeling_focalnet.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch FocalNet model. """
 
 import collections
-import inspect
 import unittest
 
 from transformers import FocalNetConfig
@@ -299,18 +298,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes[:-1]:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def check_hidden_states_output(self, inputs_dict, config, model_class, image_size):
         model = model_class(config)
         model.to(torch_device)
diff --git a/tests/models/fuyu/test_modeling_fuyu.py b/tests/models/fuyu/test_modeling_fuyu.py
index 62d58127973dfc..84c91288927726 100644
--- a/tests/models/fuyu/test_modeling_fuyu.py
+++ b/tests/models/fuyu/test_modeling_fuyu.py
@@ -24,6 +24,7 @@
 from transformers.utils import cached_property
 
 from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_vision_available():
@@ -262,9 +263,9 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class FuyuModelTest(ModelTesterMixin, unittest.TestCase):
+class FuyuModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
-    pipeline_model_mapping = {"image-to-text": FuyuForCausalLM} if is_torch_available() else {}
+    pipeline_model_mapping = {"text-generation": FuyuForCausalLM} if is_torch_available() else {}
 
     test_head_masking = False
     test_pruning = False
@@ -293,6 +294,21 @@ def test_training_gradient_checkpointing_use_reentrant(self):
     def test_training_gradient_checkpointing_use_reentrant_false(self):
         pass
 
+    # TODO: Fix me (once this model gets more usage)
+    @unittest.skip("Does not work on the tiny model.")
+    def test_disk_offload_bin(self):
+        super().test_disk_offload()
+
+    # TODO: Fix me (once this model gets more usage)
+    @unittest.skip("Does not work on the tiny model.")
+    def test_disk_offload_safetensors(self):
+        super().test_disk_offload()
+
+    # TODO: Fix me (once this model gets more usage)
+    @unittest.skip("Does not work on the tiny model.")
+    def test_model_parallelism(self):
+        super().test_model_parallelism()
+
 
 @slow
 @require_torch_gpu
diff --git a/tests/models/glpn/test_modeling_glpn.py b/tests/models/glpn/test_modeling_glpn.py
index 60e29b739f26bc..138a8cf2832eef 100644
--- a/tests/models/glpn/test_modeling_glpn.py
+++ b/tests/models/glpn/test_modeling_glpn.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch GLPN model. """
 
 
-import inspect
 import unittest
 
 from transformers import is_torch_available, is_vision_available
@@ -177,18 +176,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py
index ffd46dd197dc83..28530c72194585 100644
--- a/tests/models/idefics/test_modeling_idefics.py
+++ b/tests/models/idefics/test_modeling_idefics.py
@@ -16,11 +16,14 @@
 
 import unittest
 
+from parameterized import parameterized
+
 from transformers import BitsAndBytesConfig, IdeficsConfig, is_torch_available, is_vision_available
 from transformers.testing_utils import (
     TestCasePlus,
     require_bitsandbytes,
     require_torch,
+    require_torch_sdpa,
     require_vision,
     slow,
     torch_device,
@@ -71,6 +74,7 @@ def __init__(
         type_vocab_size=16,
         type_sequence_label_size=2,
         initializer_range=0.02,
+        alpha_initializer="ones",
         num_labels=3,
         scope=None,
         modality_type_vocab_size=2,
@@ -108,6 +112,7 @@ def __init__(
         self.type_vocab_size = type_vocab_size
         self.type_sequence_label_size = type_sequence_label_size
         self.initializer_range = initializer_range
+        self.alpha_initializer = alpha_initializer
         self.num_labels = num_labels
         self.scope = scope
         self.modality_type_vocab_size = modality_type_vocab_size
@@ -167,6 +172,57 @@ def prepare_config_and_inputs(self, num_images=1, interpolate_pos_encoding=False
         config = self.get_config()
         return (config, input_ids, input_mask, pixel_values, image_attention_mask, interpolate_pos_encoding)
 
+    def prepare_config_and_inputs_gate_tests(self):
+        # Create a list of configs and inputs, to test 2 things:
+        # 1. For the same image, the output should be different when image_attention_mask is filled with 0s vs filled with 1s.
+        # 2. For 2 different images, the output should be the same when image_attention_mask is filled with 0s.
+
+        interpolate_pos_encoding = False
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                1,
+                self.num_channels,
+                self.image_size,
+                self.image_size,
+            ]
+        )
+        pixel_values_list = [
+            pixel_values.clone(),
+            pixel_values.clone(),
+            pixel_values.clone().fill_(0.6),
+            pixel_values.clone().fill_(0.3),
+        ]
+        attention_mask = None
+        if self.use_input_mask:
+            attention_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        image_attention_mask = random_attention_mask([self.batch_size, self.seq_length, 1])
+        image_attention_mask_list = [
+            image_attention_mask.clone().fill_(0),
+            image_attention_mask.clone().fill_(1),
+            image_attention_mask.clone().fill_(0),
+            image_attention_mask.clone().fill_(0),
+        ]
+
+        config = self.get_config()
+        inputs_list = []
+        for pixel_values, image_attention_mask in zip(pixel_values_list, image_attention_mask_list):
+            inputs_list.append(
+                {
+                    "input_ids": input_ids,
+                    "attention_mask": attention_mask,
+                    "pixel_values": pixel_values,
+                    "image_attention_mask": image_attention_mask,
+                    "interpolate_pos_encoding": interpolate_pos_encoding,
+                }
+            )
+
+        inputs_w_same_img = inputs_list[:2]
+        inputs_w_0_img_attn = inputs_list[2:]
+        return config, inputs_w_same_img, inputs_w_0_img_attn
+
     def get_config(self):
         return IdeficsConfig(
             image_size=self.image_size,
@@ -184,6 +240,7 @@ def get_config(self):
             type_vocab_size=self.type_vocab_size,
             is_decoder=False,
             initializer_range=self.initializer_range,
+            alpha_initializer=self.alpha_initializer,
             num_labels=self.num_labels,
             modality_type_vocab_size=self.modality_type_vocab_size,
             vision_config=self.vision_config,
@@ -255,6 +312,12 @@ def prepare_config_and_inputs_for_common(self):
     def prepare_pixel_values(self):
         return floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])
 
+    @require_torch_sdpa
+    @slow
+    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
+    def test_eager_matches_sdpa_inference(self, torch_dtype: str):
+        self.skipTest("Idefics has a hard requirement on SDPA, skipping this test")
+
 
 @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required")
 @require_torch
@@ -337,6 +400,26 @@ def test_generate_with_image_pos_embeddings_interpolation_multiple_images(self):
         )
         self.model_tester.create_and_check_model_gen(*config_and_inputs)
 
+    def test_cross_attention_gates(self):
+        config, inputs_w_same_img, inputs_w_0_img_attn = self.model_tester.prepare_config_and_inputs_gate_tests()
+
+        model = IdeficsModel(config=config).to(torch_device)
+        model.eval()
+        test_1_results = []
+        for inputs in inputs_w_same_img:
+            with torch.no_grad():
+                last_hidden_states = model(**inputs).last_hidden_state
+            last_hidden_states = model(**inputs).last_hidden_state
+            test_1_results.append(last_hidden_states)
+        self.assertNotEqual(test_1_results[0].sum().item(), test_1_results[1].sum().item())
+
+        test_2_results = []
+        for inputs in inputs_w_0_img_attn:
+            with torch.no_grad():
+                last_hidden_states = model(**inputs).last_hidden_state
+            test_2_results.append(last_hidden_states)
+        self.assertEqual(test_2_results[0].sum().item(), test_2_results[1].sum().item())
+
     def test_training(self):
         if not self.model_tester.is_training:
             return
@@ -483,6 +566,12 @@ def test_model_from_pretrained(self):
             model = IdeficsModel.from_pretrained(model_name)
             self.assertIsNotNone(model)
 
+    @require_torch_sdpa
+    @slow
+    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
+    def test_eager_matches_sdpa_inference(self, torch_dtype: str):
+        self.skipTest("Idefics has a hard requirement on SDPA, skipping this test")
+
 
 @unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required")
 @require_torch
diff --git a/tests/models/kosmos2/test_modeling_kosmos2.py b/tests/models/kosmos2/test_modeling_kosmos2.py
index 5491ded1bc8152..dd953eedc8810d 100644
--- a/tests/models/kosmos2/test_modeling_kosmos2.py
+++ b/tests/models/kosmos2/test_modeling_kosmos2.py
@@ -37,6 +37,7 @@
     ids_tensor,
     random_attention_mask,
 )
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
@@ -244,15 +245,26 @@ def prepare_config_and_inputs_for_common(self):
 
 
 @require_torch
-class Kosmos2ModelTest(ModelTesterMixin, unittest.TestCase):
+class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (Kosmos2Model, Kosmos2ForConditionalGeneration) if is_torch_available() else ()
     all_generative_model_classes = (Kosmos2ForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"feature-extraction": Kosmos2Model, "image-to-text": Kosmos2ForConditionalGeneration}
+        if is_torch_available()
+        else {}
+    )
     fx_compatible = False
     test_head_masking = False
     test_pruning = False
     test_resize_embeddings = False
     test_attention_outputs = False
 
+    # TODO: `image-to-text` pipeline for this model needs Processor.
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        return pipeline_test_casse_name == "ImageToTextPipelineTests"
+
     def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
         inputs_dict = copy.deepcopy(inputs_dict)
 
diff --git a/tests/models/kosmos2/test_processor_kosmos2.py b/tests/models/kosmos2/test_processor_kosmos2.py
index e2147ee0608592..c3dd8c4dba586f 100644
--- a/tests/models/kosmos2/test_processor_kosmos2.py
+++ b/tests/models/kosmos2/test_processor_kosmos2.py
@@ -55,7 +55,7 @@ class Kosmos2ProcessorTest(unittest.TestCase):
     def setUp(self):
         self.tmpdirname = tempfile.mkdtemp()
 
-        image_processor = CLIPImageProcessor(use_square_size=True)
+        image_processor = CLIPImageProcessor()
 
         # We have a SentencePiece fixture for testing
         slow_tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB)
diff --git a/tests/models/levit/test_modeling_levit.py b/tests/models/levit/test_modeling_levit.py
index 0e46f6f56dd731..d569b2b5385235 100644
--- a/tests/models/levit/test_modeling_levit.py
+++ b/tests/models/levit/test_modeling_levit.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch LeViT model. """
 
 
-import inspect
 import unittest
 import warnings
 from math import ceil, floor
@@ -218,18 +217,6 @@ def test_model_common_attributes(self):
     def test_attention_outputs(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_hidden_states_output(self):
         def check_hidden_states_output(inputs_dict, config, model_class):
             model = model_class(config)
diff --git a/tests/models/llama/test_modeling_flax_llama.py b/tests/models/llama/test_modeling_flax_llama.py
new file mode 100644
index 00000000000000..a81398786b43b6
--- /dev/null
+++ b/tests/models/llama/test_modeling_flax_llama.py
@@ -0,0 +1,261 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+
+from transformers import LlamaConfig, is_flax_available, is_tokenizers_available
+from transformers.testing_utils import require_flax, slow
+
+from ...generation.test_flax_utils import FlaxGenerationTesterMixin
+from ...test_modeling_flax_common import FlaxModelTesterMixin, ids_tensor
+
+
+if is_flax_available():
+    import jax.numpy as jnp
+
+    from transformers.models.llama.modeling_flax_llama import FlaxLlamaForCausalLM, FlaxLlamaModel
+
+
+if is_tokenizers_available():
+    from transformers import LlamaTokenizerFast
+
+
+class FlaxLlamaModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=False,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=2,
+        intermediate_size=64,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        window_size=7,
+        initializer_range=0.02,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.window_size = window_size
+        self.initializer_range = initializer_range
+        self.scope = None
+        self.bos_token_id = vocab_size - 1
+        self.eos_token_id = vocab_size - 1
+        self.pad_token_id = vocab_size - 1
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = np.tril(np.ones((self.batch_size, self.seq_length)))
+
+        config = LlamaConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            use_cache=True,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+        )
+
+        return (config, input_ids, input_mask)
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask}
+        return config, inputs_dict
+
+    def check_use_cache_forward(self, model_class_name, config, input_ids, attention_mask):
+        max_decoder_length = 20
+        model = model_class_name(config)
+
+        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
+        attention_mask = jnp.ones((input_ids.shape[0], max_decoder_length), dtype="i4")
+
+        position_ids = jnp.broadcast_to(
+            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
+        )
+        outputs_cache = model(
+            input_ids[:, :-1],
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
+        outputs_cache_next = model(
+            input_ids[:, -1:],
+            attention_mask=attention_mask,
+            past_key_values=outputs_cache.past_key_values,
+            position_ids=position_ids,
+        )
+
+        outputs = model(input_ids)
+
+        diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
+        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
+
+    def check_use_cache_forward_with_attn_mask(self, model_class_name, config, input_ids, attention_mask):
+        max_decoder_length = 20
+        model = model_class_name(config)
+
+        attention_mask_cache = jnp.concatenate(
+            [attention_mask, jnp.zeros((attention_mask.shape[0], max_decoder_length - attention_mask.shape[1]))],
+            axis=-1,
+        )
+
+        past_key_values = model.init_cache(input_ids.shape[0], max_decoder_length)
+        position_ids = jnp.broadcast_to(
+            jnp.arange(input_ids.shape[-1] - 1)[None, :], (input_ids.shape[0], input_ids.shape[-1] - 1)
+        )
+
+        outputs_cache = model(
+            input_ids[:, :-1],
+            attention_mask=attention_mask_cache,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        position_ids = jnp.array(input_ids.shape[0] * [[input_ids.shape[-1] - 1]], dtype="i4")
+        outputs_cache_next = model(
+            input_ids[:, -1:],
+            past_key_values=outputs_cache.past_key_values,
+            attention_mask=attention_mask_cache,
+            position_ids=position_ids,
+        )
+
+        outputs = model(input_ids, attention_mask=attention_mask)
+
+        diff = np.max(np.abs((outputs_cache_next[0][:, -1, :5] - outputs[0][:, -1, :5])))
+        self.parent.assertTrue(diff < 1e-3, msg=f"Max diff is {diff}")
+
+
+@require_flax
+class FlaxLlamaModelTest(FlaxModelTesterMixin, FlaxGenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (FlaxLlamaModel, FlaxLlamaForCausalLM) if is_flax_available() else ()
+    all_generative_model_classes = (FlaxLlamaForCausalLM,) if is_flax_available() else ()
+
+    def setUp(self):
+        self.model_tester = FlaxLlamaModelTester(self)
+
+    def test_use_cache_forward(self):
+        for model_class_name in self.all_model_classes:
+            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
+            self.model_tester.check_use_cache_forward(model_class_name, config, input_ids, attention_mask)
+
+    def test_use_cache_forward_with_attn_mask(self):
+        for model_class_name in self.all_model_classes:
+            config, input_ids, attention_mask = self.model_tester.prepare_config_and_inputs()
+            self.model_tester.check_use_cache_forward_with_attn_mask(
+                model_class_name, config, input_ids, attention_mask
+            )
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_class_name in self.all_model_classes:
+            model = model_class_name.from_pretrained("openlm-research/open_llama_3b_v2", from_pt=True)
+            outputs = model(np.ones((1, 1)))
+            self.assertIsNotNone(outputs)
+
+
+@slow
+@require_flax
+class FlaxLlamaIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.model_id = "openlm-research/open_llama_3b_v2"
+        self.model = FlaxLlamaForCausalLM.from_pretrained(self.model_id, from_pt=True)
+        self.test_batch = jnp.arange(32).reshape(4, 8) + 1911
+
+    def test_model_logits(self):
+        flax_logits = self.model(self.test_batch).logits
+
+        # fmt: off
+        EXPECTED_LOGITS = [-74.4243, -74.0680, -65.2507, -79.1658, -77.7460, -69.2379, -86.4588, -84.8933, -77.8456]
+        EXPECTED_MIN, EXPECTED_MAX, EXPECTED_MEAN = -96.9952
+        EXPECTED_MAX = -18.4571
+        EXPECTED_MEAN = -65.0608
+        # fmt: on
+
+        self.assertTrue(np.allclose(flax_logits[0, :3, :3].flatten(), EXPECTED_LOGITS, atol=1e-4))
+        self.assertAlmostEqual(flax_logits.min(), EXPECTED_MIN, places=3)
+        self.assertAlmostEqual(flax_logits.max(), EXPECTED_MAX, places=3)
+        self.assertAlmostEqual(flax_logits.mean(), EXPECTED_MEAN, places=3)
+
+    def test_model_hidden_states(self):
+        flax_hidden_states = self.model(self.test_batch, output_hidden_states=True).hidden_states
+        flax_hidden_means = [h.mean() for h in flax_hidden_states]
+
+        # fmt: off
+        EXPECTED_HIDDEN_MEANS = [
+            -0.00007,-0.00049,-0.00169,-0.00253,-0.00271,
+            -0.00290,-0.00252,0.00230,0.00230,0.00198,
+            0.00196,0.00174,0.00246,0.00205,0.00242,
+            0.00171,0.00092,0.00054,0.00102,0.00024,
+            0.00029,0.00037,-0.00101,-0.00062,-0.00341,-0.00636,-0.00357
+        ]
+        # fmt: on
+
+        self.assertTrue(np.allclose(flax_hidden_means, EXPECTED_HIDDEN_MEANS, atol=1e-4))
+
+    def test_generated_text(self):
+        tokenizer = LlamaTokenizerFast.from_pretrained(self.model_id)
+        tokenizer.pad_token_id = 2
+        test_batch = ["Aloha, World! ", "2 + 2 = ", "Paris is the capital of ", "我很高興認識"]
+
+        inputs = tokenizer(test_batch, return_tensors="np", truncation=True, padding=True)
+        generated_ids = self.model.generate(**inputs, max_length=15).sequences
+        generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+        # fmt: off
+        EXPECTED_GENERATION = [
+            "Aloha, World! 201",
+            "2 + 2 = 4\n2",
+            "Paris is the capital of Île-",
+            "我很高興認識你，我"
+        ]
+        # fmt: on
+
+        self.assertListEqual(generated_text, EXPECTED_GENERATION)
diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py
index 21fb4f44d2b8d0..427f94f873cff2 100644
--- a/tests/models/llama/test_modeling_llama.py
+++ b/tests/models/llama/test_modeling_llama.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """ Testing suite for the PyTorch LLaMA model. """
 
-
+import tempfile
 import unittest
 
 import pytest
@@ -22,17 +22,19 @@
 
 from transformers import LlamaConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
+    require_bitsandbytes,
     require_flash_attn,
     require_torch,
     require_torch_accelerator,
     require_torch_gpu,
+    require_torch_sdpa,
     slow,
     torch_device,
 )
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -104,7 +106,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
 
         token_type_ids = None
         if self.use_token_type_ids:
@@ -385,6 +387,7 @@ def test_model_rope_scaling(self, scaling_type):
 
     @require_flash_attn
     @require_torch_gpu
+    @require_bitsandbytes
     @pytest.mark.flash_attn_test
     @slow
     def test_flash_attn_2_generate_padding_right(self):
@@ -410,7 +413,7 @@ def test_flash_attn_2_generate_padding_right(self):
         output_native = tokenizer.batch_decode(output_native)
 
         model = LlamaForCausalLM.from_pretrained(
-            "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, use_flash_attention_2=True
+            "meta-llama/Llama-2-7b-hf", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2"
         )
 
         output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False)
@@ -418,6 +421,85 @@ def test_flash_attn_2_generate_padding_right(self):
 
         self.assertListEqual(output_native, output_fa_2)
 
+    @require_flash_attn
+    @require_torch_gpu
+    @slow
+    def test_use_flash_attention_2_true(self):
+        """
+        NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
+        """
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                model = model_class(config)
+                model.save_pretrained(tmp_dir)
+
+                new_model = LlamaForCausalLM.from_pretrained(
+                    tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
+                ).to("cuda")
+
+                self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
+
+                has_flash = False
+                for name, submodule in new_model.named_modules():
+                    if "FlashAttention" in submodule.__class__.__name__:
+                        has_flash = True
+                        break
+                if not has_flash:
+                    raise ValueError("The flash model should have flash attention layers")
+
+    @require_torch_sdpa
+    @slow
+    def test_eager_matches_sdpa_generate(self):
+        """
+        Overwritting the common test as the test is flaky on tiny models
+        """
+        max_new_tokens = 30
+
+        tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
+        model_sdpa = LlamaForCausalLM.from_pretrained(
+            "meta-llama/Llama-2-7b-hf",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+        ).to(torch_device)
+
+        self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+        model_eager = LlamaForCausalLM.from_pretrained(
+            "meta-llama/Llama-2-7b-hf",
+            torch_dtype=torch.float16,
+            low_cpu_mem_usage=True,
+            attn_implementation="eager",
+        ).to(torch_device)
+
+        self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+        for name, submodule in model_eager.named_modules():
+            if "SdpaAttention" in submodule.__class__.__name__:
+                raise ValueError("The eager model should not have SDPA attention layers")
+
+        has_sdpa = False
+        for name, submodule in model_sdpa.named_modules():
+            if "SdpaAttention" in submodule.__class__.__name__:
+                has_sdpa = True
+                break
+        if not has_sdpa:
+            raise ValueError("The SDPA model should have SDPA attention layers")
+
+        texts = ["hi", "Hello this is a very long sentence my friend", "Today I am in Paris and"]
+
+        for padding_side in ["left", "right"]:
+            tokenizer.padding_side = padding_side
+            tokenizer.pad_token = tokenizer.eos_token
+
+            inputs = tokenizer(texts, return_tensors="pt", padding=True).to(torch_device)
+
+            res_eager = model_eager.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+
+            res_sdpa = model_sdpa.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False)
+            self.assertTrue(torch.allclose(res_eager, res_sdpa))
+
 
 @require_torch
 class LlamaIntegrationTest(unittest.TestCase):
diff --git a/tests/models/transfo_xl/__init__.py b/tests/models/llava/__init__.py
similarity index 100%
rename from tests/models/transfo_xl/__init__.py
rename to tests/models/llava/__init__.py
diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py
new file mode 100644
index 00000000000000..23daaecc4e62c0
--- /dev/null
+++ b/tests/models/llava/test_modeling_llava.py
@@ -0,0 +1,285 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Llava model. """
+
+import gc
+import unittest
+
+import requests
+
+from transformers import (
+    AutoProcessor,
+    LlavaConfig,
+    LlavaForConditionalGeneration,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+else:
+    is_torch_greater_or_equal_than_2_0 = False
+
+if is_vision_available():
+    from PIL import Image
+
+
+class LlavaVisionText2TextModelTester:
+    def __init__(
+        self,
+        parent,
+        ignore_index=-100,
+        image_token_index=0,
+        projector_hidden_act="gelu",
+        seq_length=7,
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-1,
+        text_config={
+            "model_type": "llama",
+            "seq_length": 7,
+            "is_training": True,
+            "use_input_mask": True,
+            "use_token_type_ids": False,
+            "use_labels": True,
+            "vocab_size": 99,
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 37,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "attention_probs_dropout_prob": 0.1,
+            "max_position_embeddings": 512,
+            "type_vocab_size": 16,
+            "type_sequence_label_size": 2,
+            "initializer_range": 0.02,
+            "num_labels": 3,
+            "num_choices": 4,
+            "pad_token_id": 0,
+        },
+        is_training=True,
+        vision_config={
+            "batch_size": 12,
+            "image_size": 30,
+            "patch_size": 2,
+            "num_channels": 3,
+            "is_training": True,
+            "hidden_size": 32,
+            "projection_dim": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 37,
+            "dropout": 0.1,
+            "attention_dropout": 0.1,
+            "initializer_range": 0.02,
+        },
+    ):
+        self.parent = parent
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.seq_length = seq_length
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.is_training = is_training
+
+        self.batch_size = 3
+        self.num_channels = 3
+        self.image_size = 336
+        self.encoder_seq_length = 231
+
+    def get_config(self):
+        return LlavaConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            ignore_index=self.ignore_index,
+            image_token_index=self.image_token_index,
+            projector_hidden_act=self.projector_hidden_act,
+            vision_feature_select_strategy=self.vision_feature_select_strategy,
+            vision_feature_layer=self.vision_feature_layer,
+        )
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.vision_config["num_channels"],
+                self.vision_config["image_size"],
+                self.vision_config["image_size"],
+            ]
+        )
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
+        attention_mask = input_ids.ne(1).to(torch_device)
+        # we are giving 3 images let's make sure we pass in 3 image tokens
+        input_ids[:, 1] = config.image_token_index
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class LlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Model tester for `LlavaForConditionalGeneration`.
+    """
+
+    all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
+    pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
+    fx_compatible = False
+    test_pruning = False
+    test_resize_embeddings = True
+    test_head_masking = False
+
+    def setUp(self):
+        self.model_tester = LlavaVisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=LlavaConfig, has_text_modality=False)
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+
+@require_torch
+class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("llava-hf/bakLlava-v1-hf")
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
+
+        prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
+        image_file = "https://llava-vl.github.io/static/images/view.jpg"
+        raw_image = Image.open(requests.get(image_file, stream=True).raw)
+        inputs = self.processor(prompt, raw_image, return_tensors="pt")
+
+        EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]])  # fmt: skip
+        self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
+
+        output = model.generate(**inputs, max_new_tokens=20)
+        EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly,"  # fmt: skip
+
+        self.assertEqual(
+            self.processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test_llama(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model_id = "llava-hf/llava-1.5-7b-hf"
+
+        model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:"
+        image_file = "https://llava-vl.github.io/static/images/view.jpg"
+        raw_image = Image.open(requests.get(image_file, stream=True).raw)
+        inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
+
+        output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
+        EXPECTED_DECODED_TEXT = "USER:  \nWhat are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the presence of wildlife, such as birds or fish, and avoid disturbing their natural habitats. Lastly, be aware of any local regulations or guidelines for the use of the pier, as some areas may be restricted or prohibited for certain activities."  # fmt: skip
+
+        self.assertEqual(
+            processor.decode(output[0], skip_special_tokens=True),
+            EXPECTED_DECODED_TEXT,
+        )
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test_llama_batched(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model_id = "llava-hf/llava-1.5-7b-hf"
+
+        model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id, pad_token="<pad>")
+
+        prompts = [
+            "USER: <image>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
+            "USER: <image>\nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: <image>\nAnd this?\nASSISTANT:",
+        ]
+        image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
+        image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True)
+
+        output = model.generate(**inputs, max_new_tokens=20)
+
+        EXPECTED_DECODED_TEXT = ['USER:  \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT: the water is calm and clear\n\nThe image shows a wooden pier on a lake, with a', 'USER:  \nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER:  \nAnd this?\nASSISTANT: A cat sleeping on a bed.']  # fmt: skip
+
+        self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test_batch(self):
+        # Let' s make sure we test the preprocessing to replace what is used
+        model = LlavaForConditionalGeneration.from_pretrained("llava-hf/bakLlava-v1-hf", load_in_4bit=True)
+        # The first batch is longer in terms of text, but only has 1 image. The second batch will be padded in text, but the first will be padded because images take more space!.
+        prompts = [
+            "USER: <image>\nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:",
+            "USER: <image>\nWhat is this?\nASSISTANT: Two cats lying on a bed!\nUSER: <image>\nAnd this?\nASSISTANT:",
+        ]
+        image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
+        image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+
+        inputs = self.processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True)
+
+        output = model.generate(**inputs, max_new_tokens=20)
+
+        EXPECTED_DECODED_TEXT = ['\nUSER: What are the things I should be cautious about when I visit this place? What should I bring with me\nASSISTANT: When visiting this place, bring a camera, Park rules may apply, Per person, Sunrise over', '\nUSER: What is this?\nASSISTANT: Two cats lying on a bed!\nUSER: And this? a dock on a lake with two cats on it (Photo credit: Jocelyn R.']  # fmt: skip
+        self.assertEqual(self.processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT)
diff --git a/tests/models/longformer/test_tokenization_longformer.py b/tests/models/longformer/test_tokenization_longformer.py
index 61d8653b60c608..32dc0f952fee55 100644
--- a/tests/models/longformer/test_tokenization_longformer.py
+++ b/tests/models/longformer/test_tokenization_longformer.py
@@ -28,7 +28,9 @@
 
 
 @require_tokenizers
+# Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest with roberta-base->allenai/longformer-base-4096,Roberta->Longformer,roberta->longformer,
 class LongformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    # Ignore copy
     tokenizer_class = LongformerTokenizer
     test_slow_tokenizer = True
     rust_tokenizer_class = LongformerTokenizerFast
@@ -71,23 +73,19 @@ def setUp(self):
         with open(self.merges_file, "w", encoding="utf-8") as fp:
             fp.write("\n".join(merges))
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_tokenizer
     def get_tokenizer(self, **kwargs):
         kwargs.update(self.special_tokens_map)
         return self.tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_rust_tokenizer
     def get_rust_tokenizer(self, **kwargs):
         kwargs.update(self.special_tokens_map)
         return self.rust_tokenizer_class.from_pretrained(self.tmpdirname, **kwargs)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.get_input_output_texts
     def get_input_output_texts(self, tokenizer):
         input_text = "lower newer"
         output_text = "lower newer"
         return input_text, output_text
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_full_tokenizer
     def test_full_tokenizer(self):
         tokenizer = self.tokenizer_class(self.vocab_file, self.merges_file, **self.special_tokens_map)
         text = "lower newer"
@@ -99,7 +97,6 @@ def test_full_tokenizer(self):
         input_bpe_tokens = [0, 1, 2, 15, 10, 9, 3, 2, 15, 19]
         self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.roberta_dict_integration_testing with roberta->longformer
     def longformer_dict_integration_testing(self):
         tokenizer = self.get_tokenizer()
 
@@ -110,7 +107,6 @@ def longformer_dict_integration_testing(self):
         )
 
     @slow
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_sequence_builders with roberta-base->allenai/longformer-base-4096
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("allenai/longformer-base-4096")
 
@@ -130,7 +126,6 @@ def test_sequence_builders(self):
         assert encoded_sentence == encoded_text_from_decode
         assert encoded_pair == encoded_pair_from_decode
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_space_encoding
     def test_space_encoding(self):
         tokenizer = self.get_tokenizer()
 
@@ -171,11 +166,9 @@ def test_space_encoding(self):
         first_char = tokenizer.convert_ids_to_tokens(encoded[mask_loc + 1])[0]
         self.assertNotEqual(first_char, space_encoding)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_pretokenized_inputs
     def test_pretokenized_inputs(self):
         pass
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_embeded_special_tokens
     def test_embeded_special_tokens(self):
         for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
             with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -208,7 +201,6 @@ def test_embeded_special_tokens(self):
                     tokens_r_str, ["<s>", "A", ",", "<mask>", "ĠAllen", "N", "LP", "Ġsentence", ".", "</s>"]
                 )
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_change_add_prefix_space_and_trim_offsets_args
     def test_change_add_prefix_space_and_trim_offsets_args(self):
         for trim_offsets, add_prefix_space in itertools.product([True, False], repeat=2):
             tokenizer_r = self.rust_tokenizer_class.from_pretrained(
@@ -223,7 +215,6 @@ def test_change_add_prefix_space_and_trim_offsets_args(self):
             self.assertEqual(post_processor_state["add_prefix_space"], add_prefix_space)
             self.assertEqual(post_processor_state["trim_offsets"], trim_offsets)
 
-    # Copied from tests.models.roberta.test_tokenization_roberta.RobertaTokenizationTest.test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments
     def test_offsets_mapping_with_different_add_prefix_space_and_trim_space_arguments(self):
         # Test which aims to verify that the offsets are well adapted to the argument `add_prefix_space` and
         # `trim_offsets`
diff --git a/tests/models/mask2former/test_modeling_mask2former.py b/tests/models/mask2former/test_modeling_mask2former.py
index b2fc84e7d324f5..8dc70d7c750d1b 100644
--- a/tests/models/mask2former/test_modeling_mask2former.py
+++ b/tests/models/mask2former/test_modeling_mask2former.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Testing suite for the PyTorch Mask2Former model. """
 
-import inspect
 import unittest
 
 import numpy as np
@@ -242,18 +241,6 @@ def test_resize_tokens_embeddings(self):
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in ["facebook/mask2former-swin-small-coco-instance"]:
diff --git a/tests/models/maskformer/test_modeling_maskformer.py b/tests/models/maskformer/test_modeling_maskformer.py
index fe1cc3423e0fbc..ffa77a051259e0 100644
--- a/tests/models/maskformer/test_modeling_maskformer.py
+++ b/tests/models/maskformer/test_modeling_maskformer.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch MaskFormer model. """
 
 import copy
-import inspect
 import unittest
 
 import numpy as np
@@ -266,18 +265,6 @@ def test_resize_tokens_embeddings(self):
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in ["facebook/maskformer-swin-small-coco"]:
diff --git a/tests/models/maskformer/test_modeling_maskformer_swin.py b/tests/models/maskformer/test_modeling_maskformer_swin.py
index 4125f36db798e1..8d29e8ebee096d 100644
--- a/tests/models/maskformer/test_modeling_maskformer_swin.py
+++ b/tests/models/maskformer/test_modeling_maskformer_swin.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch MaskFormer Swin model. """
 
 import collections
-import inspect
 import unittest
 from typing import Dict, List, Tuple
 
@@ -234,18 +233,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @unittest.skip(reason="MaskFormerSwin is only used as backbone and doesn't support output_attentions")
     def test_attention_outputs(self):
         pass
diff --git a/tests/models/mgp_str/test_modeling_mgp_str.py b/tests/models/mgp_str/test_modeling_mgp_str.py
index d8ba50a3500288..a7fd95a1311c5c 100644
--- a/tests/models/mgp_str/test_modeling_mgp_str.py
+++ b/tests/models/mgp_str/test_modeling_mgp_str.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Testing suite for the PyTorch MGP-STR model. """
 
-import inspect
 import unittest
 
 import requests
@@ -151,18 +150,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @unittest.skip(reason="MgpstrModel does not support feedforward chunking")
     def test_feed_forward_chunking(self):
         pass
diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py
index b30e70ba71f92e..35a2341b4e69d6 100644
--- a/tests/models/mistral/test_modeling_mistral.py
+++ b/tests/models/mistral/test_modeling_mistral.py
@@ -24,6 +24,7 @@
 from transformers import AutoTokenizer, MistralConfig, is_torch_available
 from transformers.testing_utils import (
     backend_empty_cache,
+    require_bitsandbytes,
     require_flash_attn,
     require_torch,
     require_torch_gpu,
@@ -33,7 +34,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -106,7 +107,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
 
         token_type_ids = None
         if self.use_token_type_ids:
@@ -301,6 +302,12 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
     test_headmasking = False
     test_pruning = False
 
+    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        return True
+
     def setUp(self):
         self.model_tester = MistralModelTester(self)
         self.config_tester = ConfigTester(self, config_class=MistralConfig, hidden_size=37)
@@ -375,17 +382,14 @@ def test_flash_attn_2_generate_padding_right(self):
         import torch
 
         for model_class in self.all_generative_model_classes:
-            if not model_class._supports_flash_attn_2:
-                return
-
             config, _ = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=False, low_cpu_mem_usage=True
-                ).to(torch_device)
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
+                    torch_device
+                )
 
                 dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device)
                 dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device)
@@ -393,7 +397,10 @@ def test_flash_attn_2_generate_padding_right(self):
                 model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False)
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
                 ).to(torch_device)
 
                 with self.assertRaises(ValueError):
@@ -405,36 +412,53 @@ def test_flash_attn_2_generate_padding_right(self):
     @require_torch_gpu
     @pytest.mark.flash_attn_test
     @slow
-    def test_flash_attn_2_inference_padding_right(self):
+    def test_flash_attn_2_generate_use_cache(self):
         import torch
 
-        for model_class in self.all_model_classes:
-            if not model_class._supports_flash_attn_2:
-                return
+        max_new_tokens = 30
+
+        for model_class in self.all_generative_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            dummy_input = inputs_dict[model_class.main_input_name]
+            if dummy_input.dtype in [torch.float32, torch.bfloat16]:
+                dummy_input = dummy_input.to(torch.float16)
+
+            # make sure that all models have enough positions for generation
+            if hasattr(config, "max_position_embeddings"):
+                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
 
-            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
-                model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
-                )
-                model_fa.to(torch_device)
+
+                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
+                # NOTE: Mistral apparently does not support right padding + use_cache with FA2.
+                dummy_attention_mask[:, -1] = 1
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
-                )
-                model.to(torch_device)
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
+                ).to(torch_device)
 
-                dummy_input = torch.LongTensor([[1, 2, 3, 4, 5]]).to(torch_device)
-                dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1, 0]]).to(torch_device)
+                # Just test that a large cache works as expected
+                _ = model.generate(
+                    dummy_input,
+                    attention_mask=dummy_attention_mask,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=False,
+                    use_cache=True,
+                )
 
-                _ = model(dummy_input, output_hidden_states=True).hidden_states[-1]
-                with self.assertRaises(ValueError):
-                    _ = model_fa(
-                        dummy_input, attention_mask=dummy_attention_mask, output_hidden_states=True
-                    ).hidden_states[-1]
+    @require_flash_attn
+    @require_torch_gpu
+    @pytest.mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_inference_padding_right(self):
+        self.skipTest("Mistral flash attention does not support right padding")
 
 
 @require_torch
@@ -474,3 +498,32 @@ def test_model_7b_generation(self):
         del model
         backend_empty_cache(torch_device)
         gc.collect()
+
+    @require_bitsandbytes
+    @slow
+    @require_flash_attn
+    def test_model_7b_long_prompt(self):
+        EXPECTED_OUTPUT_TOKEN_IDS = [306, 338]
+        # An input with 4097 tokens that is above the size of the sliding window
+        input_ids = [1] + [306, 338] * 2048
+        model = MistralForCausalLM.from_pretrained(
+            "mistralai/Mistral-7B-v0.1",
+            device_map="auto",
+            load_in_4bit=True,
+            attn_implementation="flash_attention_2",
+        )
+        input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device)
+        generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
+        self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
+
+        # Assisted generation
+        assistant_model = model
+        assistant_model.generation_config.num_assistant_tokens = 2
+        assistant_model.generation_config.num_assistant_tokens_schedule = "constant"
+        generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0)
+        self.assertEqual(EXPECTED_OUTPUT_TOKEN_IDS, generated_ids[0][-2:].tolist())
+
+        del assistant_model
+        del model
+        backend_empty_cache(torch_device)
+        gc.collect()
diff --git a/tests/models/mixtral/__init__.py b/tests/models/mixtral/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py
new file mode 100644
index 00000000000000..a2d5af00237b74
--- /dev/null
+++ b/tests/models/mixtral/test_modeling_mixtral.py
@@ -0,0 +1,537 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch Mixtral model. """
+
+
+import tempfile
+import unittest
+
+import pytest
+
+from transformers import MixtralConfig, is_torch_available
+from transformers.testing_utils import (
+    require_flash_attn,
+    require_torch,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        MixtralForCausalLM,
+        MixtralForSequenceClassification,
+        MixtralModel,
+    )
+
+
+class MixtralModelTester:
+    # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.__init__
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        seq_length=7,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=False,
+        use_labels=True,
+        vocab_size=99,
+        hidden_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        num_key_value_heads=2,
+        intermediate_size=37,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=16,
+        type_sequence_label_size=2,
+        initializer_range=0.02,
+        num_labels=3,
+        num_choices=4,
+        pad_token_id=0,
+        scope=None,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.use_labels = use_labels
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.type_sequence_label_size = type_sequence_label_size
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.pad_token_id = pad_token_id
+        self.scope = scope
+
+    # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
+
+        token_type_ids = None
+        if self.use_token_type_ids:
+            token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+        sequence_labels = None
+        token_labels = None
+        choice_labels = None
+        if self.use_labels:
+            sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+            token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+            choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+        config = self.get_config()
+
+        return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+    def get_config(self):
+        return MixtralConfig(
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            num_key_value_heads=self.num_key_value_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            type_vocab_size=self.type_vocab_size,
+            is_decoder=False,
+            initializer_range=self.initializer_range,
+            pad_token_id=self.pad_token_id,
+            num_experts_per_tok=2,
+            num_local_experts=2,
+        )
+
+    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->Mixtral
+    def create_and_check_model(
+        self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+    ):
+        model = MixtralModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask)
+        result = model(input_ids)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->Mixtral
+    def create_and_check_model_as_decoder(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.add_cross_attention = True
+        model = MixtralModel(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+        )
+        result = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+        )
+        result = model(input_ids, attention_mask=input_mask)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->Mixtral
+    def create_and_check_for_causal_lm(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        model = MixtralForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=input_mask, labels=token_labels)
+        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->Mixtral
+    def create_and_check_decoder_model_past_large_inputs(
+        self,
+        config,
+        input_ids,
+        token_type_ids,
+        input_mask,
+        sequence_labels,
+        token_labels,
+        choice_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.is_decoder = True
+        config.add_cross_attention = True
+        model = MixtralForCausalLM(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # first forward pass
+        outputs = model(
+            input_ids,
+            attention_mask=input_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=True,
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            next_input_ids,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+        output_from_past = model(
+            next_tokens,
+            attention_mask=next_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+        )["hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->Mixtral
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            token_type_ids,
+            input_mask,
+            sequence_labels,
+            token_labels,
+            choice_labels,
+        ) = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask}
+        return config, inputs_dict
+
+
+@require_torch
+# Copied from tests.models.mistral.test_modeling_mistral.MistralModelTest with Mistral->Mixtral
+class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (MixtralModel, MixtralForCausalLM, MixtralForSequenceClassification) if is_torch_available() else ()
+    )
+    all_generative_model_classes = (MixtralForCausalLM,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {
+            "feature-extraction": MixtralModel,
+            "text-classification": MixtralForSequenceClassification,
+            "text-generation": MixtralForCausalLM,
+            "zero-shot": MixtralForSequenceClassification,
+        }
+        if is_torch_available()
+        else {}
+    )
+    test_headmasking = False
+    test_pruning = False
+
+    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        return True
+
+    def setUp(self):
+        self.model_tester = MixtralModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=MixtralConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_model_various_embeddings(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        for type in ["absolute", "relative_key", "relative_key_query"]:
+            config_and_inputs[0].position_embedding_type = type
+            self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_Mixtral_sequence_classification_model(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        print(config)
+        config.num_labels = 3
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
+        model = MixtralForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+    def test_Mixtral_sequence_classification_model_for_single_label(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        config.problem_type = "single_label_classification"
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        sequence_labels = ids_tensor([self.model_tester.batch_size], self.model_tester.type_sequence_label_size)
+        model = MixtralForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+    def test_Mixtral_sequence_classification_model_for_multi_label(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        config.problem_type = "multi_label_classification"
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        sequence_labels = ids_tensor(
+            [self.model_tester.batch_size, config.num_labels], self.model_tester.type_sequence_label_size
+        ).to(torch.float)
+        model = MixtralForSequenceClassification(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
+        self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
+
+    @unittest.skip("Mixtral buffers include complex numbers, which breaks this test")
+    def test_save_load_fast_init_from_base(self):
+        pass
+
+    @unittest.skip("Mixtral uses GQA on all models so the KV cache is a non standard format")
+    def test_past_key_values_format(self):
+        pass
+
+    @require_flash_attn
+    @require_torch_gpu
+    @pytest.mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_generate_padding_right(self):
+        import torch
+
+        for model_class in self.all_generative_model_classes:
+            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
+                    torch_device
+                )
+
+                dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device)
+                dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device)
+
+                model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False)
+
+                model = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
+                ).to(torch_device)
+
+                with self.assertRaises(ValueError):
+                    _ = model.generate(
+                        dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
+                    )
+
+    @require_flash_attn
+    @require_torch_gpu
+    @pytest.mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_generate_use_cache(self):
+        import torch
+
+        max_new_tokens = 30
+
+        for model_class in self.all_generative_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            dummy_input = inputs_dict[model_class.main_input_name]
+            if dummy_input.dtype in [torch.float32, torch.bfloat16]:
+                dummy_input = dummy_input.to(torch.float16)
+
+            # make sure that all models have enough positions for generation
+            if hasattr(config, "max_position_embeddings"):
+                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
+
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
+                # NOTE: Mixtral apparently does not support right padding + use_cache with FA2.
+                dummy_attention_mask[:, -1] = 1
+
+                model = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
+                ).to(torch_device)
+
+                # Just test that a large cache works as expected
+                _ = model.generate(
+                    dummy_input,
+                    attention_mask=dummy_attention_mask,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=False,
+                    use_cache=True,
+                )
+
+    @require_flash_attn
+    @require_torch_gpu
+    @pytest.mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_inference_padding_right(self):
+        self.skipTest("Mixtral flash attention does not support right padding")
+
+    # Ignore copy
+    def test_load_balancing_loss(self):
+        r"""
+        Let's make sure we can actually compute the loss and do a backward on it.
+        """
+
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.num_labels = 3
+        config.output_router_logits = True
+        input_ids = input_dict["input_ids"]
+        attention_mask = input_ids.ne(1).to(torch_device)
+        model = MixtralForCausalLM(config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, attention_mask=attention_mask)
+        self.assertEqual(result.router_logits[0].shape, (91, config.num_experts_per_tok))
+        torch.testing.assert_close(result.aux_loss.cpu(), torch.tensor(1, dtype=torch.float32))
+
+
+@require_torch
+class MixtralIntegrationTest(unittest.TestCase):
+    @slow
+    @require_torch_gpu
+    def test_small_model_logits(self):
+        model_id = "hf-internal-testing/Mixtral-tiny"
+        dummy_input = torch.LongTensor([[0, 1, 0], [0, 1, 0]]).to(torch_device)
+
+        model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
+            torch_device
+        )
+        # TODO: might need to tweak it in case the logits do not match on our daily runners
+        # these logits have been obtained with the original megablocks impelmentation.
+        EXPECTED_LOGITS = torch.Tensor(
+            [[0.1670, 0.1620, 0.6094], [-0.8906, -0.1588, -0.6060], [0.1572, 0.1290, 0.7246]]
+        ).to(torch_device)
+
+        with torch.no_grad():
+            logits = model(dummy_input).logits
+
+        torch.testing.assert_close(logits[0, :3, :3].half(), EXPECTED_LOGITS, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(logits[1, :3, :3].half(), EXPECTED_LOGITS, atol=1e-3, rtol=1e-3)
+
+    @slow
+    # @require_torch_gpu
+    def test_small_model_logits_batched(self):
+        model_id = "hf-internal-testing/Mixtral-tiny"
+        dummy_input = torch.LongTensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]).to(torch_device)
+        attention_mask = dummy_input.ne(0).to(torch.long)
+
+        model = MixtralForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True).to(
+            torch_device
+        )
+
+        # TODO: might need to tweak it in case the logits do not match on our daily runners
+        EXPECTED_LOGITS_LEFT = torch.Tensor(
+            [[0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007]],
+        )
+
+        # logits[0, -3:, -3:].half()
+        EXPECTED_LOGITS_LEFT_UNPADDED = torch.Tensor(
+            [[0.2212, 0.5200, -0.3816], [0.8213, -0.2313, 0.6069], [0.2664, -0.7090, 0.2468]],
+        )
+
+        # logits[1, -3:, -3:].half()
+        EXPECTED_LOGITS_RIGHT_UNPADDED = torch.Tensor(
+            [[0.2205, 0.1232, -0.1611], [-0.3484, 0.3030, -1.0312], [0.0742, 0.7930, 0.7969]]
+        )
+
+        with torch.no_grad():
+            logits = model(dummy_input, attention_mask=attention_mask).logits
+
+        torch.testing.assert_close(logits[0, :3, :3].half(), EXPECTED_LOGITS_LEFT, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(logits[0, -3:, -3:].half(), EXPECTED_LOGITS_LEFT_UNPADDED, atol=1e-3, rtol=1e-3)
+        torch.testing.assert_close(logits[1, -3:, -3:].half(), EXPECTED_LOGITS_RIGHT_UNPADDED, atol=1e-3, rtol=1e-3)
diff --git a/tests/models/mobilebert/test_modeling_mobilebert.py b/tests/models/mobilebert/test_modeling_mobilebert.py
index a914ce578dc88c..e4ebca4b6e5b64 100644
--- a/tests/models/mobilebert/test_modeling_mobilebert.py
+++ b/tests/models/mobilebert/test_modeling_mobilebert.py
@@ -302,10 +302,6 @@ def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
     def test_resize_tokens_embeddings(self):
         super().test_resize_tokens_embeddings()
 
-    @unittest.skip("This test is currently broken because of safetensors.")
-    def test_tf_from_pt_safetensors(self):
-        pass
-
     def setUp(self):
         self.model_tester = MobileBertModelTester(self)
         self.config_tester = ConfigTester(self, config_class=MobileBertConfig, hidden_size=37)
diff --git a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
index 8c24935800b900..35848da3161d51 100644
--- a/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
+++ b/tests/models/mobilenet_v1/test_modeling_mobilenet_v1.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch MobileNetV1 model. """
 
 
-import inspect
 import unittest
 
 from transformers import MobileNetV1Config
@@ -177,18 +176,6 @@ def test_model_common_attributes(self):
     def test_attention_outputs(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
index 06b2bd9d3fc466..bbd83408853ceb 100644
--- a/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
+++ b/tests/models/mobilenet_v2/test_modeling_mobilenet_v2.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch MobileNetV2 model. """
 
 
-import inspect
 import unittest
 
 from transformers import MobileNetV2Config
@@ -228,18 +227,6 @@ def test_model_common_attributes(self):
     def test_attention_outputs(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/mobilevit/test_modeling_mobilevit.py b/tests/models/mobilevit/test_modeling_mobilevit.py
index 2c01ea0c99bb4e..563bee802322d0 100644
--- a/tests/models/mobilevit/test_modeling_mobilevit.py
+++ b/tests/models/mobilevit/test_modeling_mobilevit.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch MobileViT model. """
 
 
-import inspect
 import unittest
 
 from transformers import MobileViTConfig
@@ -221,18 +220,6 @@ def test_model_common_attributes(self):
     def test_attention_outputs(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py
index b1961b2e6d4a3c..192cf3a9e1e896 100644
--- a/tests/models/mobilevitv2/test_modeling_mobilevitv2.py
+++ b/tests/models/mobilevitv2/test_modeling_mobilevitv2.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch MobileViTV2 model. """
 
 
-import inspect
 import unittest
 
 from transformers import MobileViTV2Config
@@ -228,18 +227,6 @@ def test_attention_outputs(self):
     def test_multi_gpu_data_parallel_forward(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/mpnet/test_modeling_mpnet.py b/tests/models/mpnet/test_modeling_mpnet.py
index 52d8d1f8b4c68c..10c0c164d16f30 100644
--- a/tests/models/mpnet/test_modeling_mpnet.py
+++ b/tests/models/mpnet/test_modeling_mpnet.py
@@ -246,7 +246,7 @@ def test_for_question_answering(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_mpnet_for_question_answering(*config_and_inputs)
 
-    @unittest.skip("This isn't passing but should, seems like a misconfiguration of tied weights.")
+    @unittest.skip("TFMPNet adds poolers to all models, unlike the PT model class.")
     def test_tf_from_pt_safetensors(self):
         return
 
diff --git a/tests/models/nat/test_modeling_nat.py b/tests/models/nat/test_modeling_nat.py
index a27b087ce51904..3ab49d2d9557fa 100644
--- a/tests/models/nat/test_modeling_nat.py
+++ b/tests/models/nat/test_modeling_nat.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Nat model. """
 
 import collections
-import inspect
 import unittest
 
 from transformers import NatConfig
@@ -261,18 +260,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         self.skipTest("Nat's attention operation is handled entirely by NATTEN.")
 
diff --git a/tests/models/nllb_moe/test_modeling_nllb_moe.py b/tests/models/nllb_moe/test_modeling_nllb_moe.py
index 1109948e0e7092..2e8ba30ce675a6 100644
--- a/tests/models/nllb_moe/test_modeling_nllb_moe.py
+++ b/tests/models/nllb_moe/test_modeling_nllb_moe.py
@@ -348,10 +348,6 @@ def test_get_loss(self):
         self.assertIsNotNone(model(**input_dict)["encoder_router_logits"][1])
         self.assertIsNotNone(model(**input_dict)["decoder_router_logits"][0])
 
-    @unittest.skip("Test does not fail individually but fails on the CI @ArthurZucker looking into it")
-    def test_assisted_decoding_sample(self):
-        pass
-
 
 @require_torch
 @require_sentencepiece
diff --git a/tests/models/owlv2/test_modeling_owlv2.py b/tests/models/owlv2/test_modeling_owlv2.py
index 51d9e537840944..8dbf3fcde89bbc 100644
--- a/tests/models/owlv2/test_modeling_owlv2.py
+++ b/tests/models/owlv2/test_modeling_owlv2.py
@@ -840,10 +840,12 @@ def test_inference_object_detection(self):
         num_queries = int((model.config.vision_config.image_size / model.config.vision_config.patch_size) ** 2)
         self.assertEqual(outputs.pred_boxes.shape, torch.Size((1, num_queries, 4)))
 
-        expected_slice_logits = torch.tensor([[-21.4139, -21.6130], [-19.0084, -19.5491], [-20.9592, -21.3830]])
+        expected_slice_logits = torch.tensor(
+            [[-21.413497, -21.612638], [-19.008193, -19.548841], [-20.958896, -21.382694]]
+        ).to(torch_device)
         self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_slice_logits, atol=1e-4))
         expected_slice_boxes = torch.tensor(
-            [[0.2413, 0.0519, 0.4533], [0.1395, 0.0457, 0.2507], [0.2330, 0.0505, 0.4277]],
+            [[0.241309, 0.051896, 0.453267], [0.139474, 0.045701, 0.250660], [0.233022, 0.050479, 0.427671]],
         ).to(torch_device)
         self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_slice_boxes, atol=1e-4))
 
diff --git a/tests/models/patchtsmixer/__init__.py b/tests/models/patchtsmixer/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/patchtsmixer/test_modeling_patchtsmixer.py b/tests/models/patchtsmixer/test_modeling_patchtsmixer.py
new file mode 100644
index 00000000000000..4f91dc0301c2af
--- /dev/null
+++ b/tests/models/patchtsmixer/test_modeling_patchtsmixer.py
@@ -0,0 +1,1094 @@
+# coding=utf-8
+# Copyright 2023 IBM and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch PatchTSMixer model. """
+
+import inspect
+import itertools
+import random
+import tempfile
+import unittest
+from typing import Dict, List, Optional, Tuple, Union
+
+from huggingface_hub import hf_hub_download
+from parameterized import parameterized
+
+from transformers import is_torch_available
+from transformers.models.auto import get_values
+from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+TOLERANCE = 1e-4
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING,
+        MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING,
+        PatchTSMixerConfig,
+        PatchTSMixerForPrediction,
+        PatchTSMixerForPretraining,
+        PatchTSMixerForRegression,
+        PatchTSMixerForTimeSeriesClassification,
+        PatchTSMixerModel,
+    )
+    from transformers.models.patchtsmixer.modeling_patchtsmixer import (
+        PatchTSMixerEncoder,
+        PatchTSMixerForPredictionHead,
+        PatchTSMixerForPredictionOutput,
+        PatchTSMixerForRegressionOutput,
+        PatchTSMixerForTimeSeriesClassificationOutput,
+        PatchTSMixerLinearHead,
+        PatchTSMixerPretrainHead,
+    )
+
+
+@require_torch
+class PatchTSMixerModelTester:
+    def __init__(
+        self,
+        context_length: int = 32,
+        patch_length: int = 8,
+        num_input_channels: int = 3,
+        patch_stride: int = 8,
+        # d_model: int = 128,
+        hidden_size: int = 8,
+        # num_layers: int = 8,
+        num_hidden_layers: int = 2,
+        expansion_factor: int = 2,
+        dropout: float = 0.5,
+        mode: str = "common_channel",
+        gated_attn: bool = True,
+        norm_mlp="LayerNorm",
+        swin_hier: int = 0,
+        # masking related
+        mask_type: str = "forecast",
+        random_mask_ratio=0.5,
+        mask_patches: list = [2, 3],
+        forecast_mask_ratios: list = [1, 1],
+        mask_value=0,
+        masked_loss: bool = False,
+        mask_mode: str = "mask_before_encoder",
+        channel_consistent_masking: bool = True,
+        scaling: Optional[Union[str, bool]] = "std",
+        # Head related
+        head_dropout: float = 0.2,
+        # forecast related
+        prediction_length: int = 16,
+        out_channels: int = None,
+        # Classification/regression related
+        # num_labels: int = 3,
+        num_targets: int = 3,
+        output_range: list = None,
+        head_aggregation: str = None,
+        # Trainer related
+        batch_size=13,
+        is_training=True,
+        seed_number=42,
+        post_init=True,
+        num_parallel_samples=4,
+    ):
+        self.num_input_channels = num_input_channels
+        self.context_length = context_length
+        self.patch_length = patch_length
+        self.patch_stride = patch_stride
+        # self.d_model = d_model
+        self.hidden_size = hidden_size
+        self.expansion_factor = expansion_factor
+        # self.num_layers = num_layers
+        self.num_hidden_layers = num_hidden_layers
+        self.dropout = dropout
+        self.mode = mode
+        self.gated_attn = gated_attn
+        self.norm_mlp = norm_mlp
+        self.swin_hier = swin_hier
+        self.scaling = scaling
+        self.head_dropout = head_dropout
+        # masking related
+        self.mask_type = mask_type
+        self.random_mask_ratio = random_mask_ratio
+        self.mask_patches = mask_patches
+        self.forecast_mask_ratios = forecast_mask_ratios
+        self.mask_value = mask_value
+        self.channel_consistent_masking = channel_consistent_masking
+        self.mask_mode = mask_mode
+        self.masked_loss = masked_loss
+        # patching related
+        self.patch_last = True
+        # forecast related
+        self.prediction_length = prediction_length
+        self.out_channels = out_channels
+        # classification/regression related
+        # self.num_labels = num_labels
+        self.num_targets = num_targets
+        self.output_range = output_range
+        self.head_aggregation = head_aggregation
+        # Trainer related
+        self.batch_size = batch_size
+        self.is_training = is_training
+        self.seed_number = seed_number
+        self.post_init = post_init
+        self.num_parallel_samples = num_parallel_samples
+
+    def get_config(self):
+        config_ = PatchTSMixerConfig(
+            num_input_channels=self.num_input_channels,
+            context_length=self.context_length,
+            patch_length=self.patch_length,
+            patch_stride=self.patch_stride,
+            # d_model = self.d_model,
+            d_model=self.hidden_size,
+            expansion_factor=self.expansion_factor,
+            # num_layers = self.num_layers,
+            num_layers=self.num_hidden_layers,
+            dropout=self.dropout,
+            mode=self.mode,
+            gated_attn=self.gated_attn,
+            norm_mlp=self.norm_mlp,
+            swin_hier=self.swin_hier,
+            scaling=self.scaling,
+            head_dropout=self.head_dropout,
+            mask_type=self.mask_type,
+            random_mask_ratio=self.random_mask_ratio,
+            mask_patches=self.mask_patches,
+            forecast_mask_ratios=self.forecast_mask_ratios,
+            mask_value=self.mask_value,
+            channel_consistent_masking=self.channel_consistent_masking,
+            mask_mode=self.mask_mode,
+            masked_loss=self.masked_loss,
+            prediction_length=self.prediction_length,
+            out_channels=self.out_channels,
+            # num_labels=self.num_labels,
+            num_targets=self.num_targets,
+            output_range=self.output_range,
+            head_aggregation=self.head_aggregation,
+            post_init=self.post_init,
+        )
+        self.num_patches = config_.num_patches
+        return config_
+
+    def prepare_patchtsmixer_inputs_dict(self, config):
+        _past_length = config.context_length
+        # bs, n_vars, num_patch, patch_length
+
+        # [bs x context_length x n_vars]
+        past_values = floats_tensor([self.batch_size, _past_length, self.num_input_channels])
+
+        future_values = floats_tensor([self.batch_size, config.prediction_length, self.num_input_channels])
+
+        inputs_dict = {
+            "past_values": past_values,
+            "future_values": future_values,
+        }
+        return inputs_dict
+
+    def prepare_config_and_inputs(self):
+        config = self.get_config()
+        inputs_dict = self.prepare_patchtsmixer_inputs_dict(config)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+
+@require_torch
+class PatchTSMixerModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            PatchTSMixerModel,
+            PatchTSMixerForPrediction,
+            PatchTSMixerForPretraining,
+            PatchTSMixerForTimeSeriesClassification,
+            PatchTSMixerForRegression,
+        )
+        if is_torch_available()
+        else ()
+    )
+    all_generative_model_classes = (
+        (PatchTSMixerForPrediction, PatchTSMixerForPretraining) if is_torch_available() else ()
+    )
+    pipeline_model_mapping = {"feature-extraction": PatchTSMixerModel} if is_torch_available() else {}
+    is_encoder_decoder = False
+    test_pruning = False
+    test_head_masking = False
+    test_missing_keys = False
+    test_torchscript = False
+    test_inputs_embeds = False
+    test_model_common_attributes = False
+
+    test_resize_embeddings = True
+    test_resize_position_embeddings = False
+    test_mismatched_shapes = True
+    test_model_parallel = False
+    has_attentions = False
+
+    def setUp(self):
+        self.model_tester = PatchTSMixerModelTester()
+        self.config_tester = ConfigTester(
+            self,
+            config_class=PatchTSMixerConfig,
+            has_text_modality=False,
+            prediction_length=self.model_tester.prediction_length,
+            common_properties=["hidden_size", "expansion_factor", "num_hidden_layers"],
+        )
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+        # if classification model:
+        if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING):
+            rng = random.Random(self.model_tester.seed_number)
+            labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_targets, rng=rng)
+            # inputs_dict["labels"] = labels
+            inputs_dict["future_values"] = labels
+            # inputs_dict.pop("future_values")
+        elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING):
+            rng = random.Random(self.model_tester.seed_number)
+            labels = floats_tensor([self.model_tester.batch_size, self.model_tester.num_targets], rng=rng)
+            # inputs_dict["labels"] = labels
+            inputs_dict["future_values"] = labels
+            # inputs_dict.pop("future_values")
+        elif model_class in [PatchTSMixerModel, PatchTSMixerForPretraining]:
+            inputs_dict.pop("future_values")
+
+        inputs_dict["output_hidden_states"] = True
+        return inputs_dict
+
+    def test_save_load_strict(self):
+        config, _ = self.model_tester.prepare_config_and_inputs()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
+            self.assertEqual(info["missing_keys"], [])
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states
+
+            expected_num_layers = getattr(
+                self.model_tester,
+                "expected_num_hidden_layers",
+                self.model_tester.num_hidden_layers,
+            )
+            self.assertEqual(len(hidden_states), expected_num_layers)
+
+            expected_hidden_size = self.model_tester.hidden_size
+            self.assertEqual(hidden_states[0].shape[-1], expected_hidden_size)
+
+            num_patch = self.model_tester.num_patches
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [num_patch, self.model_tester.hidden_size],
+            )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    @unittest.skip("No tokens embeddings")
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    def test_model_outputs_equivalence(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        def set_nan_tensor_to_zero(t):
+            t[t != t] = 0
+            return t
+
+        def check_equivalence(model, tuple_inputs, dict_inputs, additional_kwargs={}):
+            with torch.no_grad():
+                tuple_output = model(**tuple_inputs, return_dict=False, **additional_kwargs)
+                output_ = model(**dict_inputs, return_dict=True, **additional_kwargs)
+                attributes_ = vars(output_)
+                dict_output = tuple(attributes_.values())
+
+                def recursive_check(tuple_object, dict_object):
+                    if isinstance(tuple_object, (List, Tuple)):
+                        for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif isinstance(tuple_object, Dict):
+                        for tuple_iterable_value, dict_iterable_value in zip(
+                            tuple_object.values(), dict_object.values()
+                        ):
+                            recursive_check(tuple_iterable_value, dict_iterable_value)
+                    elif tuple_object is None:
+                        return
+                    else:
+                        self.assertTrue(
+                            torch.allclose(
+                                set_nan_tensor_to_zero(tuple_object),
+                                set_nan_tensor_to_zero(dict_object),
+                                atol=1e-5,
+                            ),
+                            msg=(
+                                "Tuple and dict output are not equal. Difference:"
+                                f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
+                                f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object)}. Dict has"
+                                f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object)}."
+                            ),
+                        )
+
+                recursive_check(tuple_output, dict_output)
+
+        for model_class in self.all_model_classes:
+            print(model_class)
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class)
+            tuple_inputs.update({"output_hidden_states": False})
+            dict_inputs.update({"output_hidden_states": False})
+            check_equivalence(model, tuple_inputs, dict_inputs)
+
+            tuple_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            dict_inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
+            tuple_inputs.update({"output_hidden_states": False})
+            dict_inputs.update({"output_hidden_states": False})
+            check_equivalence(
+                model,
+                tuple_inputs,
+                dict_inputs,
+            )
+
+    def test_model_main_input_name(self):
+        model_signature = inspect.signature(getattr(PatchTSMixerModel, "forward"))
+        # The main input is the name of the argument after `self`
+        observed_main_input_name = list(model_signature.parameters.keys())[1]
+        self.assertEqual(PatchTSMixerModel.main_input_name, observed_main_input_name)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names_with_target = [
+                "past_values",
+                "observed_mask",
+                "future_values",
+                "output_hidden_states",
+                "return_loss",
+            ]
+            expected_arg_names_without_target = [
+                "past_values",
+                "observed_mask",
+                "output_hidden_states",
+            ]
+
+            expected_arg_names = expected_arg_names_with_target
+            if model_class == PatchTSMixerForPretraining:
+                expected_arg_names = expected_arg_names_without_target + ["return_loss"]
+            if model_class == PatchTSMixerModel:
+                expected_arg_names = expected_arg_names_without_target
+            if model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values(
+                MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING
+            ):
+                expected_arg_names.remove("observed_mask")
+
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    @is_flaky()
+    def test_retain_grad_hidden_states_attentions(self):
+        super().test_retain_grad_hidden_states_attentions()
+
+
+def prepare_batch(repo_id="ibm/patchtsmixer-etth1-test-data", file="pretrain_batch.pt"):
+    # TODO: Make repo public
+    file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset")
+    batch = torch.load(file, map_location=torch_device)
+    return batch
+
+
+@require_torch
+@slow
+class PatchTSMixerModelIntegrationTests(unittest.TestCase):
+    def test_pretrain_head(self):
+        model = PatchTSMixerForPretraining.from_pretrained("ibm/patchtsmixer-etth1-pretrain").to(torch_device)
+        batch = prepare_batch()
+
+        torch.manual_seed(0)
+        with torch.no_grad():
+            output = model(past_values=batch["past_values"].to(torch_device)).prediction_outputs
+        num_patch = (
+            max(model.config.context_length, model.config.patch_length) - model.config.patch_length
+        ) // model.config.patch_stride + 1
+        expected_shape = torch.Size(
+            [
+                32,
+                model.config.num_input_channels,
+                num_patch,
+                model.config.patch_length,
+            ]
+        )
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor([[[[0.1870]],[[-1.5819]],[[-0.0991]],[[-1.2609]],[[0.5633]],[[-0.5723]],[[0.3387]],]],device=torch_device)  # fmt: skip
+        self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE))
+
+    def test_forecasting_head(self):
+        model = PatchTSMixerForPrediction.from_pretrained("ibm/patchtsmixer-etth1-forecasting").to(torch_device)
+        batch = prepare_batch(file="forecast_batch.pt")
+
+        model.eval()
+        torch.manual_seed(0)
+        with torch.no_grad():
+            output = model(
+                past_values=batch["past_values"].to(torch_device),
+                future_values=batch["future_values"].to(torch_device),
+            ).prediction_outputs
+
+        expected_shape = torch.Size([32, model.config.prediction_length, model.config.num_input_channels])
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.4271, -0.0651, 0.4656, 0.7104, -0.3085, -1.9658, 0.4560]],
+            device=torch_device,
+        )
+        self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE))
+
+    def test_prediction_generation(self):
+        torch_device = "cpu"
+        model = PatchTSMixerForPrediction.from_pretrained("ibm/patchtsmixer-etth1-generate").to(torch_device)
+        batch = prepare_batch(file="forecast_batch.pt")
+        print(batch["past_values"])
+
+        model.eval()
+        torch.manual_seed(0)
+        with torch.no_grad():
+            outputs = model.generate(past_values=batch["past_values"].to(torch_device))
+        expected_shape = torch.Size((32, 1, model.config.prediction_length, model.config.num_input_channels))
+
+        self.assertEqual(outputs.sequences.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.0091, -0.3625, -0.0887, 0.6544, -0.4100, -2.3124, 0.3376]],
+            device=torch_device,
+        )
+        mean_prediction = outputs.sequences.mean(dim=1)
+
+        self.assertTrue(torch.allclose(mean_prediction[0, -1:], expected_slice, atol=TOLERANCE))
+
+
+@require_torch
+class PatchTSMixerFunctionalTests(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        """Setup method: Called once before test-cases execution"""
+        cls.params = {}
+        cls.params.update(
+            context_length=32,
+            patch_length=8,
+            num_input_channels=3,
+            patch_stride=8,
+            d_model=4,
+            expansion_factor=2,
+            num_layers=3,
+            dropout=0.2,
+            mode="common_channel",  # common_channel,  mix_channel
+            gated_attn=True,
+            norm_mlp="LayerNorm",
+            mask_type="random",
+            random_mask_ratio=0.5,
+            mask_patches=[2, 3],
+            forecast_mask_ratios=[1, 1],
+            mask_value=0,
+            masked_loss=True,
+            channel_consistent_masking=True,
+            head_dropout=0.2,
+            prediction_length=64,
+            out_channels=None,
+            # num_labels=3,
+            num_targets=3,
+            output_range=None,
+            head_aggregation=None,
+            scaling="std",
+            use_positional_encoding=False,
+            positional_encoding="sincos",
+            self_attn=False,
+            self_attn_heads=1,
+            num_parallel_samples=4,
+        )
+
+        cls.num_patches = (
+            max(cls.params["context_length"], cls.params["patch_length"]) - cls.params["patch_length"]
+        ) // cls.params["patch_stride"] + 1
+
+        # batch_size = 32
+        batch_size = 2
+
+        int(cls.params["prediction_length"] / cls.params["patch_length"])
+
+        cls.data = torch.rand(
+            batch_size,
+            cls.params["context_length"],
+            cls.params["num_input_channels"],
+        )
+
+        cls.enc_data = torch.rand(
+            batch_size,
+            cls.params["num_input_channels"],
+            cls.num_patches,
+            cls.params["patch_length"],
+        )
+
+        cls.enc_output = torch.rand(
+            batch_size,
+            cls.params["num_input_channels"],
+            cls.num_patches,
+            cls.params["d_model"],
+        )
+
+        cls.flat_enc_output = torch.rand(
+            batch_size,
+            cls.num_patches,
+            cls.params["d_model"],
+        )
+
+        cls.correct_pred_output = torch.rand(
+            batch_size,
+            cls.params["prediction_length"],
+            cls.params["num_input_channels"],
+        )
+        cls.correct_regression_output = torch.rand(batch_size, cls.params["num_targets"])
+
+        cls.correct_pretrain_output = torch.rand(
+            batch_size,
+            cls.params["num_input_channels"],
+            cls.num_patches,
+            cls.params["patch_length"],
+        )
+
+        cls.correct_forecast_output = torch.rand(
+            batch_size,
+            cls.params["prediction_length"],
+            cls.params["num_input_channels"],
+        )
+
+        cls.correct_sel_forecast_output = torch.rand(batch_size, cls.params["prediction_length"], 2)
+
+        cls.correct_classification_output = torch.rand(
+            batch_size,
+            cls.params["num_targets"],
+        )
+
+        cls.correct_classification_classes = torch.randint(0, cls.params["num_targets"], (batch_size,))
+
+    def test_patchtsmixer_encoder(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        enc = PatchTSMixerEncoder(config)
+        output = enc(self.__class__.enc_data)
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+
+    def test_patchmodel(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerModel(config)
+        output = mdl(self.__class__.data)
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.patch_input.shape, self.__class__.enc_data.shape)
+
+    def test_pretrainhead(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        head = PatchTSMixerPretrainHead(
+            config=config,
+        )
+        output = head(self.__class__.enc_output)
+
+        self.assertEqual(output.shape, self.__class__.correct_pretrain_output.shape)
+
+    def test_pretrain_full(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerForPretraining(config)
+        output = mdl(self.__class__.data)
+        self.assertEqual(
+            output.prediction_outputs.shape,
+            self.__class__.correct_pretrain_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+    def test_pretrain_full_with_return_dict(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerForPretraining(config)
+        output = mdl(self.__class__.data, return_dict=False)
+        self.assertEqual(output[1].shape, self.__class__.correct_pretrain_output.shape)
+        self.assertEqual(output[2].shape, self.__class__.enc_output.shape)
+        self.assertEqual(output[0].item() < 100, True)
+
+    def test_forecast_head(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        head = PatchTSMixerForPredictionHead(
+            config=config,
+        )
+        # output = head(self.__class__.enc_output, raw_data = self.__class__.correct_pretrain_output)
+        output = head(self.__class__.enc_output)
+
+        self.assertEqual(output.shape, self.__class__.correct_forecast_output.shape)
+
+    def check_module(
+        self,
+        task,
+        params=None,
+        output_hidden_states=True,
+    ):
+        config = PatchTSMixerConfig(**params)
+        if task == "forecast":
+            mdl = PatchTSMixerForPrediction(config)
+            target_input = self.__class__.correct_forecast_output
+            if config.prediction_channel_indices is not None:
+                target_output = self.__class__.correct_sel_forecast_output
+            else:
+                target_output = target_input
+            ref_samples = target_output.unsqueeze(1).expand(-1, config.num_parallel_samples, -1, -1)
+
+        elif task == "classification":
+            mdl = PatchTSMixerForTimeSeriesClassification(config)
+            target_input = self.__class__.correct_classification_classes
+            target_output = self.__class__.correct_classification_output
+        elif task == "regression":
+            mdl = PatchTSMixerForRegression(config)
+            target_input = self.__class__.correct_regression_output
+            target_output = self.__class__.correct_regression_output
+            ref_samples = target_output.unsqueeze(1).expand(-1, config.num_parallel_samples, -1)
+        elif task == "pretrain":
+            mdl = PatchTSMixerForPretraining(config)
+            target_input = None
+            target_output = self.__class__.correct_pretrain_output
+        else:
+            print("invalid task")
+
+        enc_output = self.__class__.enc_output
+
+        if target_input is None:
+            output = mdl(self.__class__.data, output_hidden_states=output_hidden_states)
+        else:
+            output = mdl(
+                self.__class__.data,
+                future_values=target_input,
+                output_hidden_states=output_hidden_states,
+            )
+
+        if isinstance(output.prediction_outputs, tuple):
+            for t in output.prediction_outputs:
+                self.assertEqual(t.shape, target_output.shape)
+        else:
+            self.assertEqual(output.prediction_outputs.shape, target_output.shape)
+
+        self.assertEqual(output.last_hidden_state.shape, enc_output.shape)
+
+        if output_hidden_states is True:
+            self.assertEqual(len(output.hidden_states), params["num_layers"])
+
+        else:
+            self.assertEqual(output.hidden_states, None)
+
+        self.assertEqual(output.loss.item() < 100, True)
+
+        if config.loss == "nll" and task in ["forecast", "regression"]:
+            samples = mdl.generate(self.__class__.data)
+            self.assertEqual(samples.sequences.shape, ref_samples.shape)
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                ["common_channel", "mix_channel"],
+                [True, False],
+                [True, False, "mean", "std"],
+                [True, False],
+                [None, [0, 2]],
+                ["mse", "nll"],
+            )
+        )
+    )
+    def test_forecast(self, mode, self_attn, scaling, gated_attn, prediction_channel_indices, loss):
+        params = self.__class__.params.copy()
+        params.update(
+            mode=mode,
+            self_attn=self_attn,
+            scaling=scaling,
+            prediction_channel_indices=prediction_channel_indices,
+            gated_attn=gated_attn,
+            loss=loss,
+        )
+
+        self.check_module(task="forecast", params=params)
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                ["common_channel", "mix_channel"],
+                [True, False],
+                [True, False, "mean", "std"],
+                [True, False],
+                ["max_pool", "avg_pool"],
+            )
+        )
+    )
+    def test_classification(self, mode, self_attn, scaling, gated_attn, head_aggregation):
+        params = self.__class__.params.copy()
+        params.update(
+            mode=mode,
+            self_attn=self_attn,
+            scaling=scaling,
+            head_aggregation=head_aggregation,
+            gated_attn=gated_attn,
+        )
+
+        self.check_module(task="classification", params=params)
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                ["common_channel", "mix_channel"],
+                [True, False],
+                [True, False, "mean", "std"],
+                [True, False],
+                ["max_pool", "avg_pool"],
+                ["mse", "nll"],
+            )
+        )
+    )
+    def test_regression(self, mode, self_attn, scaling, gated_attn, head_aggregation, loss):
+        params = self.__class__.params.copy()
+        params.update(
+            mode=mode,
+            self_attn=self_attn,
+            scaling=scaling,
+            head_aggregation=head_aggregation,
+            gated_attn=gated_attn,
+            loss=loss,
+        )
+
+        self.check_module(task="regression", params=params)
+
+    @parameterized.expand(
+        list(
+            itertools.product(
+                ["common_channel", "mix_channel"],
+                [True, False],
+                [True, False, "mean", "std"],
+                [True, False],
+                ["random", "forecast"],
+                [True, False],
+                [True, False],
+            )
+        )
+    )
+    def test_pretrain(
+        self,
+        mode,
+        self_attn,
+        scaling,
+        gated_attn,
+        mask_type,
+        masked_loss,
+        channel_consistent_masking,
+    ):
+        params = self.__class__.params.copy()
+        params.update(
+            mode=mode,
+            self_attn=self_attn,
+            scaling=scaling,
+            gated_attn=gated_attn,
+            mask_type=mask_type,
+            masked_loss=masked_loss,
+            channel_consistent_masking=channel_consistent_masking,
+        )
+
+        self.check_module(task="pretrain", params=params)
+
+    def forecast_full_module(self, params=None, output_hidden_states=False, return_dict=None):
+        config = PatchTSMixerConfig(**params)
+        mdl = PatchTSMixerForPrediction(config)
+
+        target_val = self.__class__.correct_forecast_output
+
+        if config.prediction_channel_indices is not None:
+            target_val = self.__class__.correct_sel_forecast_output
+
+        enc_output = self.__class__.enc_output
+
+        output = mdl(
+            self.__class__.data,
+            future_values=self.__class__.correct_forecast_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if isinstance(output, tuple):
+            output = PatchTSMixerForPredictionOutput(*output)
+
+        if config.loss == "mse":
+            self.assertEqual(output.prediction_outputs.shape, target_val.shape)
+
+        self.assertEqual(output.last_hidden_state.shape, enc_output.shape)
+
+        if output_hidden_states is True:
+            self.assertEqual(len(output.hidden_states), params["num_layers"])
+
+        else:
+            self.assertEqual(output.hidden_states, None)
+
+        self.assertEqual(output.loss.item() < 100, True)
+
+        if config.loss == "nll":
+            samples = mdl.generate(self.__class__.data)
+            ref_samples = target_val.unsqueeze(1).expand(-1, params["num_parallel_samples"], -1, -1)
+            self.assertEqual(samples.sequences.shape, ref_samples.shape)
+
+    def test_forecast_full(self):
+        self.check_module(task="forecast", params=self.__class__.params, output_hidden_states=True)
+        # self.forecast_full_module(self.__class__.params, output_hidden_states = True)
+
+    def test_forecast_full_2(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+        )
+        self.forecast_full_module(params, output_hidden_states=True)
+
+    def test_forecast_full_2_with_return_dict(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+        )
+        self.forecast_full_module(params, output_hidden_states=True, return_dict=False)
+
+    def test_forecast_full_3(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+        )
+        self.forecast_full_module(params, output_hidden_states=True)
+
+    def test_forecast_full_5(self):
+        params = self.__class__.params.copy()
+        params.update(
+            self_attn=True,
+            use_positional_encoding=True,
+            positional_encoding="sincos",
+        )
+        self.forecast_full_module(params, output_hidden_states=True)
+
+    def test_forecast_full_4(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+            prediction_channel_indices=[0, 2],
+        )
+        self.forecast_full_module(params)
+
+    def test_forecast_full_distributional(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+            prediction_channel_indices=[0, 2],
+            loss="nll",
+            distribution_output="normal",
+        )
+
+        self.forecast_full_module(params)
+
+    def test_forecast_full_distributional_2(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+            prediction_channel_indices=[0, 2],
+            loss="nll",
+            # distribution_output = "normal",
+        )
+        self.forecast_full_module(params)
+
+    def test_forecast_full_distributional_3(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+            # prediction_channel_indices=[0, 2],
+            loss="nll",
+            distribution_output="normal",
+        )
+        self.forecast_full_module(params)
+
+    def test_forecast_full_distributional_4(self):
+        params = self.__class__.params.copy()
+        params.update(
+            mode="mix_channel",
+            # prediction_channel_indices=[0, 2],
+            loss="nll",
+            distribution_output="normal",
+        )
+        self.forecast_full_module(params)
+
+    def test_classification_head(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        head = PatchTSMixerLinearHead(
+            config=config,
+        )
+        # output = head(self.__class__.enc_output, raw_data = self.__class__.correct_pretrain_output)
+        output = head(self.__class__.enc_output)
+
+        self.assertEqual(output.shape, self.__class__.correct_classification_output.shape)
+
+    def test_classification_full(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerForTimeSeriesClassification(config)
+        output = mdl(
+            self.__class__.data,
+            future_values=self.__class__.correct_classification_classes,
+        )
+        self.assertEqual(
+            output.prediction_outputs.shape,
+            self.__class__.correct_classification_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+    def test_classification_full_with_return_dict(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerForTimeSeriesClassification(config)
+        output = mdl(
+            self.__class__.data,
+            future_values=self.__class__.correct_classification_classes,
+            return_dict=False,
+        )
+        if isinstance(output, tuple):
+            output = PatchTSMixerForTimeSeriesClassificationOutput(*output)
+        self.assertEqual(
+            output.prediction_outputs.shape,
+            self.__class__.correct_classification_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+    def test_regression_head(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        head = PatchTSMixerLinearHead(
+            config=config,
+        )
+        output = head(self.__class__.enc_output)
+        self.assertEqual(output.shape, self.__class__.correct_regression_output.shape)
+
+    def test_regression_full(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerForRegression(config)
+        output = mdl(self.__class__.data, future_values=self.__class__.correct_regression_output)
+        self.assertEqual(
+            output.prediction_outputs.shape,
+            self.__class__.correct_regression_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+    def test_regression_full_with_return_dict(self):
+        config = PatchTSMixerConfig(**self.__class__.params)
+        mdl = PatchTSMixerForRegression(config)
+        output = mdl(
+            self.__class__.data,
+            future_values=self.__class__.correct_regression_output,
+            return_dict=False,
+        )
+        if isinstance(output, tuple):
+            output = PatchTSMixerForRegressionOutput(*output)
+        self.assertEqual(
+            output.prediction_outputs.shape,
+            self.__class__.correct_regression_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+    def test_regression_full_distribute(self):
+        params = self.__class__.params.copy()
+        params.update(loss="nll", distribution_output="normal")
+
+        config = PatchTSMixerConfig(**params)
+
+        mdl = PatchTSMixerForRegression(config)
+        output = mdl(self.__class__.data, future_values=self.__class__.correct_regression_output)
+        self.assertEqual(
+            output.prediction_outputs[0].shape,
+            self.__class__.correct_regression_output.shape,
+        )
+        self.assertEqual(
+            output.prediction_outputs[1].shape,
+            self.__class__.correct_regression_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+        if config.loss == "nll":
+            samples = mdl.generate(self.__class__.data)
+            ref_samples = self.__class__.correct_regression_output.unsqueeze(1).expand(
+                -1, params["num_parallel_samples"], -1
+            )
+            self.assertEqual(samples.sequences.shape, ref_samples.shape)
+
+    def test_regression_full_distribute_2(self):
+        params = self.__class__.params.copy()
+        params.update(loss="nll", distribution_output="student_t")
+
+        config = PatchTSMixerConfig(**params)
+
+        mdl = PatchTSMixerForRegression(config)
+        output = mdl(self.__class__.data, future_values=self.__class__.correct_regression_output)
+        self.assertEqual(
+            output.prediction_outputs[0].shape,
+            self.__class__.correct_regression_output.shape,
+        )
+        self.assertEqual(
+            output.prediction_outputs[1].shape,
+            self.__class__.correct_regression_output.shape,
+        )
+        self.assertEqual(output.last_hidden_state.shape, self.__class__.enc_output.shape)
+        self.assertEqual(output.loss.item() < 100, True)
+
+        if config.loss == "nll":
+            samples = mdl.generate(self.__class__.data)
+            ref_samples = self.__class__.correct_regression_output.unsqueeze(1).expand(
+                -1, params["num_parallel_samples"], -1
+            )
+            self.assertEqual(samples.sequences.shape, ref_samples.shape)
diff --git a/tests/models/patchtst/__init__.py b/tests/models/patchtst/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/patchtst/test_modeling_patchtst.py b/tests/models/patchtst/test_modeling_patchtst.py
new file mode 100644
index 00000000000000..beb122849eb083
--- /dev/null
+++ b/tests/models/patchtst/test_modeling_patchtst.py
@@ -0,0 +1,385 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch PatchTST model. """
+
+import inspect
+import random
+import tempfile
+import unittest
+
+from huggingface_hub import hf_hub_download
+
+from transformers import is_torch_available
+from transformers.models.auto import get_values
+from transformers.testing_utils import is_flaky, require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+TOLERANCE = 1e-4
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING,
+        MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING,
+        PatchTSTConfig,
+        PatchTSTForClassification,
+        PatchTSTForPrediction,
+        PatchTSTForPretraining,
+        PatchTSTForRegression,
+        PatchTSTModel,
+    )
+
+
+@require_torch
+class PatchTSTModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=13,
+        prediction_length=7,
+        context_length=14,
+        patch_length=5,
+        patch_stride=5,
+        num_input_channels=1,
+        num_time_features=1,
+        is_training=True,
+        hidden_size=16,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        intermediate_size=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        distil=False,
+        seed=42,
+        num_targets=2,
+        mask_type="random",
+        random_mask_ratio=0,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.prediction_length = prediction_length
+        self.context_length = context_length
+        self.patch_length = patch_length
+        self.patch_stride = patch_stride
+        self.num_input_channels = num_input_channels
+        self.num_time_features = num_time_features
+        self.is_training = is_training
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.mask_type = mask_type
+        self.random_mask_ratio = random_mask_ratio
+
+        self.seed = seed
+        self.num_targets = num_targets
+        self.distil = distil
+        self.num_patches = (max(self.context_length, self.patch_length) - self.patch_length) // self.patch_stride + 1
+        # define seq_length so that it can pass the test_attention_outputs
+        self.seq_length = self.num_patches
+
+    def get_config(self):
+        return PatchTSTConfig(
+            prediction_length=self.prediction_length,
+            patch_length=self.patch_length,
+            patch_stride=self.patch_stride,
+            num_input_channels=self.num_input_channels,
+            d_model=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            ffn_dim=self.intermediate_size,
+            dropout=self.hidden_dropout_prob,
+            attention_dropout=self.attention_probs_dropout_prob,
+            context_length=self.context_length,
+            activation_function=self.hidden_act,
+            seed=self.seed,
+            num_targets=self.num_targets,
+            mask_type=self.mask_type,
+            random_mask_ratio=self.random_mask_ratio,
+        )
+
+    def prepare_patchtst_inputs_dict(self, config):
+        _past_length = config.context_length
+        # bs, num_input_channels, num_patch, patch_len
+
+        # [bs x seq_len x num_input_channels]
+        past_values = floats_tensor([self.batch_size, _past_length, self.num_input_channels])
+
+        future_values = floats_tensor([self.batch_size, config.prediction_length, self.num_input_channels])
+
+        inputs_dict = {
+            "past_values": past_values,
+            "future_values": future_values,
+        }
+        return inputs_dict
+
+    def prepare_config_and_inputs(self):
+        config = self.get_config()
+        inputs_dict = self.prepare_patchtst_inputs_dict(config)
+        return config, inputs_dict
+
+    def prepare_config_and_inputs_for_common(self):
+        config, inputs_dict = self.prepare_config_and_inputs()
+        return config, inputs_dict
+
+
+@require_torch
+class PatchTSTModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    all_model_classes = (
+        (
+            PatchTSTModel,
+            PatchTSTForPrediction,
+            PatchTSTForPretraining,
+            PatchTSTForClassification,
+            PatchTSTForRegression,
+        )
+        if is_torch_available()
+        else ()
+    )
+
+    pipeline_model_mapping = {"feature-extraction": PatchTSTModel} if is_torch_available() else {}
+    is_encoder_decoder = False
+    test_pruning = False
+    test_head_masking = False
+    test_missing_keys = True
+    test_torchscript = False
+    test_inputs_embeds = False
+    test_model_common_attributes = False
+
+    test_resize_embeddings = True
+    test_resize_position_embeddings = False
+    test_mismatched_shapes = True
+    test_model_parallel = False
+    has_attentions = True
+
+    def setUp(self):
+        self.model_tester = PatchTSTModelTester(self)
+        self.config_tester = ConfigTester(
+            self,
+            config_class=PatchTSTConfig,
+            has_text_modality=False,
+            prediction_length=self.model_tester.prediction_length,
+        )
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False):
+        inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels)
+
+        #  if PatchTSTForPretraining
+        if model_class == PatchTSTForPretraining:
+            inputs_dict.pop("future_values")
+        # else if classification model:
+        elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING):
+            rng = random.Random(self.model_tester.seed)
+            labels = ids_tensor([self.model_tester.batch_size], self.model_tester.num_targets, rng=rng)
+            inputs_dict["target_values"] = labels
+            inputs_dict.pop("future_values")
+        elif model_class in get_values(MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING):
+            rng = random.Random(self.model_tester.seed)
+            target_values = floats_tensor([self.model_tester.batch_size, self.model_tester.num_targets], rng=rng)
+            inputs_dict["target_values"] = target_values
+            inputs_dict.pop("future_values")
+        return inputs_dict
+
+    def test_save_load_strict(self):
+        config, _ = self.model_tester.prepare_config_and_inputs()
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model2, info = model_class.from_pretrained(tmpdirname, output_loading_info=True)
+            self.assertEqual(info["missing_keys"], [])
+
+    def test_hidden_states_output(self):
+        def check_hidden_states_output(inputs_dict, config, model_class):
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            hidden_states = outputs.hidden_states
+
+            expected_num_layers = getattr(
+                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers
+            )
+            self.assertEqual(len(hidden_states), expected_num_layers)
+
+            num_patch = self.model_tester.num_patches
+            self.assertListEqual(
+                list(hidden_states[0].shape[-2:]),
+                [num_patch, self.model_tester.hidden_size],
+            )
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_hidden_states"] = True
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+            # check that output_hidden_states also work using config
+            del inputs_dict["output_hidden_states"]
+            config.output_hidden_states = True
+
+            check_hidden_states_output(inputs_dict, config, model_class)
+
+    @unittest.skip(reason="we have no tokens embeddings")
+    def test_resize_tokens_embeddings(self):
+        pass
+
+    def test_model_main_input_name(self):
+        model_signature = inspect.signature(getattr(PatchTSTModel, "forward"))
+        # The main input is the name of the argument after `self`
+        observed_main_input_name = list(model_signature.parameters.keys())[1]
+        self.assertEqual(PatchTSTModel.main_input_name, observed_main_input_name)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            if model_class == PatchTSTForPretraining:
+                expected_arg_names = [
+                    "past_values",
+                    "past_observed_mask",
+                ]
+            elif model_class in get_values(MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING) or model_class in get_values(
+                MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING
+            ):
+                expected_arg_names = ["past_values", "target_values", "past_observed_mask"]
+            else:
+                expected_arg_names = [
+                    "past_values",
+                    "past_observed_mask",
+                    "future_values",
+                ]
+
+            expected_arg_names.extend(
+                [
+                    "output_hidden_states",
+                    "output_attentions",
+                    "return_dict",
+                ]
+            )
+
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    @is_flaky()
+    def test_retain_grad_hidden_states_attentions(self):
+        super().test_retain_grad_hidden_states_attentions()
+
+
+def prepare_batch(repo_id="hf-internal-testing/etth1-hourly-batch", file="train-batch.pt"):
+    file = hf_hub_download(repo_id=repo_id, filename=file, repo_type="dataset")
+    batch = torch.load(file, map_location=torch_device)
+    return batch
+
+
+# Note: Pretrained model is not yet downloadable.
+@require_torch
+@slow
+class PatchTSTModelIntegrationTests(unittest.TestCase):
+    # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable.
+    def test_pretrain_head(self):
+        model = PatchTSTForPretraining.from_pretrained("namctin/patchtst_etth1_pretrain").to(torch_device)
+        batch = prepare_batch()
+
+        torch.manual_seed(0)
+        with torch.no_grad():
+            output = model(past_values=batch["past_values"].to(torch_device)).prediction_output
+        num_patch = (
+            max(model.config.context_length, model.config.patch_length) - model.config.patch_length
+        ) // model.config.patch_stride + 1
+        expected_shape = torch.Size([64, model.config.num_input_channels, num_patch, model.config.patch_length])
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[[-0.0173]], [[-1.0379]], [[-0.1030]], [[0.3642]], [[0.1601]], [[-1.3136]], [[0.8780]]],
+            device=torch_device,
+        )
+        self.assertTrue(torch.allclose(output[0, :7, :1, :1], expected_slice, atol=TOLERANCE))
+
+    # Publishing of pretrained weights are under internal review. Pretrained model is not yet downloadable.
+    def test_prediction_head(self):
+        model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast").to(torch_device)
+        batch = prepare_batch(file="test-batch.pt")
+
+        torch.manual_seed(0)
+        with torch.no_grad():
+            output = model(
+                past_values=batch["past_values"].to(torch_device),
+                future_values=batch["future_values"].to(torch_device),
+            ).prediction_outputs
+        expected_shape = torch.Size([64, model.config.prediction_length, model.config.num_input_channels])
+        self.assertEqual(output.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.5142, 0.6928, 0.6118, 0.5724, -0.3735, -0.1336, -0.7124]],
+            device=torch_device,
+        )
+        self.assertTrue(torch.allclose(output[0, :1, :7], expected_slice, atol=TOLERANCE))
+
+    def test_prediction_generation(self):
+        model = PatchTSTForPrediction.from_pretrained("namctin/patchtst_etth1_forecast").to(torch_device)
+        batch = prepare_batch(file="test-batch.pt")
+
+        torch.manual_seed(0)
+        with torch.no_grad():
+            outputs = model.generate(past_values=batch["past_values"].to(torch_device))
+        expected_shape = torch.Size((64, 1, model.config.prediction_length, model.config.num_input_channels))
+
+        self.assertEqual(outputs.sequences.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.4075, 0.3716, 0.4786, 0.2842, -0.3107, -0.0569, -0.7489]],
+            device=torch_device,
+        )
+        mean_prediction = outputs.sequences.mean(dim=1)
+        self.assertTrue(torch.allclose(mean_prediction[0, -1:], expected_slice, atol=TOLERANCE))
+
+    def test_regression_generation(self):
+        model = PatchTSTForRegression.from_pretrained("namctin/patchtst_etth1_regression").to(torch_device)
+        batch = prepare_batch(file="test-batch.pt")
+
+        torch.manual_seed(0)
+        with torch.no_grad():
+            outputs = model.generate(past_values=batch["past_values"].to(torch_device))
+        expected_shape = torch.Size((64, model.config.num_parallel_samples, model.config.num_targets))
+        self.assertEqual(outputs.sequences.shape, expected_shape)
+
+        expected_slice = torch.tensor(
+            [[0.3228, 0.4320, 0.4591, 0.4066, -0.3461, 0.3094, -0.8426]],
+            device=torch_device,
+        )
+        mean_prediction = outputs.sequences.mean(dim=1)
+
+        self.assertTrue(torch.allclose(mean_prediction[0, -1:], expected_slice, rtol=TOLERANCE))
diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py
index 0ffb999145be22..864db992772772 100644
--- a/tests/models/persimmon/test_modeling_persimmon.py
+++ b/tests/models/persimmon/test_modeling_persimmon.py
@@ -32,7 +32,7 @@
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor, random_attention_mask
+from ...test_modeling_common import ModelTesterMixin, ids_tensor
 from ...test_pipeline_mixin import PipelineTesterMixin
 
 
@@ -104,7 +104,7 @@ def prepare_config_and_inputs(self):
 
         input_mask = None
         if self.use_input_mask:
-            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+            input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device)
 
         token_type_ids = None
         if self.use_token_type_ids:
diff --git a/tests/models/phi/test_modeling_phi.py b/tests/models/phi/test_modeling_phi.py
index 93c5ca85e9a4a3..516dd1ee626e7f 100644
--- a/tests/models/phi/test_modeling_phi.py
+++ b/tests/models/phi/test_modeling_phi.py
@@ -18,8 +18,17 @@
 
 import unittest
 
+import pytest
+
 from transformers import PhiConfig, is_torch_available
-from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.testing_utils import (
+    require_bitsandbytes,
+    require_flash_attn,
+    require_torch,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
 
 from ...generation.test_utils import GenerationTesterMixin
 from ...test_configuration_common import ConfigTester
@@ -31,6 +40,7 @@
     import torch
 
     from transformers import (
+        AutoTokenizer,
         PhiForCausalLM,
         PhiForSequenceClassification,
         PhiForTokenClassification,
@@ -38,7 +48,6 @@
     )
 
 
-# Copied from tests.models.llama.test_modeling_llama.LlamaModelTester with Llama->Phi
 class PhiModelTester:
     def __init__(
         self,
@@ -288,6 +297,12 @@ class PhiModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
     test_headmasking = False
     test_pruning = False
 
+    # TODO (ydshieh): Check this. See https://app.circleci.com/pipelines/github/huggingface/transformers/79292/workflows/fa2ba644-8953-44a6-8f67-ccd69ca6a476/jobs/1012905
+    def is_pipeline_test_to_skip(
+        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
+    ):
+        return True
+
     # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.setUp with Llama->Phi
     def setUp(self):
         self.model_tester = PhiModelTester(self)
@@ -345,6 +360,43 @@ def test_phi_sequence_classification_model_for_multi_label(self):
         result = model(input_ids, attention_mask=attention_mask, labels=sequence_labels)
         self.assertEqual(result.logits.shape, (self.model_tester.batch_size, self.model_tester.num_labels))
 
+    @require_flash_attn
+    @require_torch_gpu
+    @require_bitsandbytes
+    @pytest.mark.flash_attn_test
+    @slow
+    # Copied from tests.models.llama.test_modeling_llama.LlamaModelTest.test_flash_attn_2_generate_padding_right with LlamaForCausalLM->PhiForCausalLM,LlamaTokenizer->AutoTokenizer,meta-llama/Llama-2-7b-hf->susnato/phi-1_5_dev
+    def test_flash_attn_2_generate_padding_right(self):
+        """
+        Overwritting the common test as the test is flaky on tiny models
+        """
+        model = PhiForCausalLM.from_pretrained(
+            "susnato/phi-1_5_dev",
+            load_in_4bit=True,
+            device_map={"": 0},
+        )
+
+        tokenizer = AutoTokenizer.from_pretrained("susnato/phi-1_5_dev")
+
+        texts = ["hi", "Hello this is a very long sentence"]
+
+        tokenizer.padding_side = "right"
+        tokenizer.pad_token = tokenizer.eos_token
+
+        inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0)
+
+        output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        output_native = tokenizer.batch_decode(output_native)
+
+        model = PhiForCausalLM.from_pretrained(
+            "susnato/phi-1_5_dev", load_in_4bit=True, device_map={"": 0}, attn_implementation="flash_attention_2"
+        )
+
+        output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False)
+        output_fa_2 = tokenizer.batch_decode(output_fa_2)
+
+        self.assertListEqual(output_native, output_fa_2)
+
 
 @slow
 @require_torch
diff --git a/tests/models/poolformer/test_modeling_poolformer.py b/tests/models/poolformer/test_modeling_poolformer.py
index 99667d6f1b4523..070564e718bf1e 100644
--- a/tests/models/poolformer/test_modeling_poolformer.py
+++ b/tests/models/poolformer/test_modeling_poolformer.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch PoolFormer model. """
 
 
-import inspect
 import unittest
 
 from transformers import is_torch_available, is_vision_available
@@ -208,18 +207,6 @@ def test_training(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in POOLFORMER_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/pvt/test_modeling_pvt.py b/tests/models/pvt/test_modeling_pvt.py
index 04ce2153053188..e174b67a07887c 100644
--- a/tests/models/pvt/test_modeling_pvt.py
+++ b/tests/models/pvt/test_modeling_pvt.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Pvt model. """
 
 
-import inspect
 import unittest
 
 from transformers import is_torch_available, is_vision_available
@@ -253,18 +252,6 @@ def test_training(self):
             loss = model(**inputs).loss
             loss.backward()
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in PVT_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/rag/test_retrieval_rag.py b/tests/models/rag/test_retrieval_rag.py
index d4c119815c96f4..1cf155b39a9e2d 100644
--- a/tests/models/rag/test_retrieval_rag.py
+++ b/tests/models/rag/test_retrieval_rag.py
@@ -14,7 +14,6 @@
 
 import json
 import os
-import pickle
 import shutil
 import tempfile
 from unittest import TestCase
@@ -174,37 +173,6 @@ def get_dummy_custom_hf_index_retriever(self, from_disk: bool):
             )
         return retriever
 
-    def get_dummy_legacy_index_retriever(self):
-        dataset = Dataset.from_dict(
-            {
-                "id": ["0", "1"],
-                "text": ["foo", "bar"],
-                "title": ["Foo", "Bar"],
-                "embeddings": [np.ones(self.retrieval_vector_size + 1), 2 * np.ones(self.retrieval_vector_size + 1)],
-            }
-        )
-        dataset.add_faiss_index("embeddings", string_factory="Flat", metric_type=faiss.METRIC_INNER_PRODUCT)
-
-        index_file_name = os.path.join(self.tmpdirname, "hf_bert_base.hnswSQ8_correct_phi_128.c_index")
-        dataset.save_faiss_index("embeddings", index_file_name + ".index.dpr")
-        pickle.dump(dataset["id"], open(index_file_name + ".index_meta.dpr", "wb"))
-
-        passages_file_name = os.path.join(self.tmpdirname, "psgs_w100.tsv.pkl")
-        passages = {sample["id"]: [sample["text"], sample["title"]] for sample in dataset}
-        pickle.dump(passages, open(passages_file_name, "wb"))
-
-        config = RagConfig(
-            retrieval_vector_size=self.retrieval_vector_size,
-            question_encoder=DPRConfig().to_dict(),
-            generator=BartConfig().to_dict(),
-            index_name="legacy",
-            index_path=self.tmpdirname,
-        )
-        retriever = RagRetriever(
-            config, question_encoder_tokenizer=self.get_dpr_tokenizer(), generator_tokenizer=self.get_bart_tokenizer()
-        )
-        return retriever
-
     def test_canonical_hf_index_retriever_retrieve(self):
         n_docs = 1
         retriever = self.get_dummy_canonical_hf_index_retriever()
@@ -288,33 +256,6 @@ def test_custom_hf_index_retriever_save_and_from_pretrained_from_disk(self):
             out = retriever.retrieve(hidden_states, n_docs=1)
             self.assertTrue(out is not None)
 
-    def test_legacy_index_retriever_retrieve(self):
-        n_docs = 1
-        retriever = self.get_dummy_legacy_index_retriever()
-        hidden_states = np.array(
-            [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32
-        )
-        retrieved_doc_embeds, doc_ids, doc_dicts = retriever.retrieve(hidden_states, n_docs=n_docs)
-        self.assertEqual(retrieved_doc_embeds.shape, (2, n_docs, self.retrieval_vector_size))
-        self.assertEqual(len(doc_dicts), 2)
-        self.assertEqual(sorted(doc_dicts[0]), ["text", "title"])
-        self.assertEqual(len(doc_dicts[0]["text"]), n_docs)
-        self.assertEqual(doc_dicts[0]["text"][0], "bar")  # max inner product is reached with second doc
-        self.assertEqual(doc_dicts[1]["text"][0], "foo")  # max inner product is reached with first doc
-        self.assertListEqual(doc_ids.tolist(), [[1], [0]])
-
-    def test_legacy_hf_index_retriever_save_and_from_pretrained(self):
-        retriever = self.get_dummy_legacy_index_retriever()
-        with tempfile.TemporaryDirectory() as tmp_dirname:
-            retriever.save_pretrained(tmp_dirname)
-            retriever = RagRetriever.from_pretrained(tmp_dirname)
-            self.assertIsInstance(retriever, RagRetriever)
-            hidden_states = np.array(
-                [np.ones(self.retrieval_vector_size), -np.ones(self.retrieval_vector_size)], dtype=np.float32
-            )
-            out = retriever.retrieve(hidden_states, n_docs=1)
-            self.assertTrue(out is not None)
-
     @require_torch
     @require_tokenizers
     @require_sentencepiece
diff --git a/tests/models/regnet/test_modeling_regnet.py b/tests/models/regnet/test_modeling_regnet.py
index debd8271b5c640..9840575f317ecd 100644
--- a/tests/models/regnet/test_modeling_regnet.py
+++ b/tests/models/regnet/test_modeling_regnet.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch RegNet model. """
 
 
-import inspect
 import unittest
 
 from transformers import RegNetConfig
@@ -161,18 +160,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py
new file mode 100644
index 00000000000000..12c43643be7a48
--- /dev/null
+++ b/tests/models/rembert/test_tokenization_rembert.py
@@ -0,0 +1,243 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the RemBert tokenizer. """
+
+
+import tempfile
+import unittest
+
+from tests.test_tokenization_common import AddedToken, TokenizerTesterMixin
+from transformers import RemBertTokenizer, RemBertTokenizerFast
+from transformers.testing_utils import get_tests_dir, require_sentencepiece, require_tokenizers
+
+
+SENTENCEPIECE_UNDERLINE = "▁"
+SPIECE_UNDERLINE = SENTENCEPIECE_UNDERLINE  # Kept for backward compatibility
+
+SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
+
+
+@require_sentencepiece
+@require_tokenizers
+class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
+    tokenizer_class = RemBertTokenizer
+    rust_tokenizer_class = RemBertTokenizerFast
+    space_between_special_tokens = True
+    test_rust_tokenizer = True
+    test_sentencepiece_ignore_case = True
+    pre_trained_model_path = "google/rembert"
+
+    def setUp(self):
+        super().setUp()
+
+        tokenizer = RemBertTokenizer(SAMPLE_VOCAB)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    # Copied from ReformerTokenizationTest.get_input_output_texts
+    def get_input_output_texts(self, tokenizer):
+        input_text = "this is a test"
+        output_text = "this is a test"
+        return input_text, output_text
+
+    def test_get_vocab(self):
+        vocab_keys = list(self.get_tokenizer().get_vocab().keys())
+        self.assertEqual(vocab_keys[0], "<unk>")
+        self.assertEqual(vocab_keys[1], "<s>")
+
+        self.assertEqual(vocab_keys[5], "▁the")
+        self.assertEqual(vocab_keys[2], "</s>")
+
+    def test_vocab_size(self):
+        self.assertEqual(self.get_tokenizer().vocab_size, 1_000)
+
+    def test_full_tokenizer(self):
+        tokenizer = RemBertTokenizer(SAMPLE_VOCAB, keep_accents=True)
+
+        tokens = tokenizer.tokenize("This is a test")
+        self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens),
+            [285, 46, 10, 170, 382],
+        )
+
+        tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.")
+        self.assertListEqual( tokens,  [SPIECE_UNDERLINE + "I",SPIECE_UNDERLINE + "was",SPIECE_UNDERLINE + "b","or","n",SPIECE_UNDERLINE + "in",SPIECE_UNDERLINE + "","9","2","0","0","0",",",SPIECE_UNDERLINE + "and",SPIECE_UNDERLINE + "this",SPIECE_UNDERLINE + "is",SPIECE_UNDERLINE + "f","al","s","é",".",],)  # fmt: skip
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(ids, [8, 21, 84, 55, 24, 19, 7, 0, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 0, 4])
+
+    def test_encode_decode_round_trip(self):
+        tokenizer = RemBertTokenizer(SAMPLE_VOCAB, keep_accents=True)
+
+        text = "清水寺は京都にある。"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["▁", "清水寺は京都にある。"])
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual(encoded_string, [1000, 7, 0, 1001])
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEquals(decode_text, text)
+
+        text = "That's awesome! 🤩 #HuggingFace,  🌟 Have a great day! 🌈"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual( tokens, ['▁That', "'", 's', '▁a', 'w', 'es', 'ome', '!', '▁', '🤩', '▁', '#', 'H', 'u', 'g', 'g', 'ing', 'F', 'a', 'ce', ',', '▁', '🌟', '▁H', 'a', 've', '▁a', '▁great', '▁day', '!', '▁', '🌈'])  # fmt: skip
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEquals(decode_text, "That's awesome! 🤩 #HuggingFace, 🌟 Have a great day! 🌈")
+
+        text = "In the sky up above"
+        tokens = tokenizer._tokenize(text)
+        self.assertListEqual(tokens, ["▁In", "▁the", "▁s", "k", "y", "▁up", "▁a", "b", "o", "ve"])  # fmt: skip
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual(encoded_string, [1000, 388, 5, 47, 45, 30, 118, 10, 65, 20, 123, 1001])
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEqual(text, decode_text)
+
+        text = "The cat. . Sat <s>.In a room"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(
+            tokens, ["▁The", "▁c", "at", ".", "▁", ".", "▁S", "at", "▁", "<", "s", ">", ".", "I", "n", "▁a", "▁room"]
+        )
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual(
+            encoded_string, [1000, 68, 69, 76, 4, 7, 4, 166, 76, 7, 0, 6, 0, 4, 100, 24, 10, 136, 1001]
+        )
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEqual(text, decode_text)
+
+        text = "Invoice #12345, dated 2023-12-01, is due on 2024-01-15."
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ['▁In', 'v', 'o', 'ic', 'e', '▁', '#', '1', '2', '34', '5', ',', '▁da', 'ted', '▁', '2', '0', '2', '3', '-', '1', '2', '-', '0', '1', ',', '▁is', '▁d', 'u', 'e', '▁on', '▁', '2', '0', '2', '4', '-', '0', '1', '-', '1', '5', '.'])  # fmt: skip
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual(encoded_string, [1000, 388, 83, 20, 113, 15, 7, 0, 356, 602, 0, 555, 3, 417, 273, 7, 602, 347, 602, 0, 33, 356, 602, 33, 347, 356, 3, 46, 229, 51, 15, 59, 7, 602, 347, 602, 0, 33, 347, 356, 33, 356, 555, 4, 1001])  # fmt: skip
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEqual(text, decode_text)
+
+        text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit..."
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens,  ['▁', 'L', 'or', 'em', '▁', 'i', 'p', 's', 'um', '▁do', 'l', 'or', '▁sit', '▁am', 'e', 't', ',', '▁con', 'se', 'c', 'te', 't', 'ur', '▁a', 'd', 'i', 'p', 'is', 'c', 'ing', '▁', 'el', 'it', '.', '.', '.'])  # fmt: skip
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual( encoded_string,  [1000, 7, 279, 55, 300, 7, 23, 29, 6, 155, 92, 27, 55, 615, 219, 15, 14, 3, 247, 114, 28, 181, 14, 108, 10, 16, 23, 29, 125, 28, 17, 7, 168, 137, 4, 4, 4, 1001] )  # fmt: skip
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEqual(text, decode_text)
+
+        # for multiple language in one sentence
+        text = "Bonjour! Hello! こんにちは!"
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ["▁B", "on", "j", "o", "ur", "!", "▁He", "ll", "o", "!", "▁", "こんにちは", "!"])
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual(encoded_string, [1000, 295, 109, 999, 20, 108, 146, 156, 86, 20, 146, 7, 0, 146, 1001])
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEqual(text, decode_text)
+
+        text = "Extra spaces\tand\nline breaks\r\nshould be handled."
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, ['▁E', 'x', 't', 'r', 'a', '▁sp', 'a', 'ce', 's', '▁and', '▁line', '▁b', 're', 'a', 'k', 's', '▁should', '▁be', '▁hand', 'led', '.'])  # fmt: skip
+        encoded_string = tokenizer.encode(text)
+        self.assertListEqual(
+            encoded_string,
+            [1000, 454, 297, 14, 35, 18, 277, 18, 133, 6, 12, 485, 84, 56, 18, 45, 6, 173, 36, 363, 338, 4, 1001],
+        )
+        decode_text = tokenizer.convert_tokens_to_string(tokens)
+        self.assertEqual("Extra spaces and line breaks should be handled.", decode_text)
+
+    def test_sequence_builders(self):
+        tokenizer = RemBertTokenizer(SAMPLE_VOCAB)
+
+        text = tokenizer.encode("sequence builders")
+        text_2 = tokenizer.encode("multi-sequence build")
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        assert encoded_sentence == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id]
+        assert encoded_pair == [tokenizer.cls_token_id] + text + [tokenizer.sep_token_id] + text_2 + [
+            tokenizer.sep_token_id
+        ]
+
+    def test_added_tokens_serialization(self):
+        # Utility to test the added vocab
+        def _test_added_vocab_and_eos(expected, tokenizer_class, expected_eos, temp_dir):
+            tokenizer = tokenizer_class.from_pretrained(temp_dir)
+            self.assertTrue(str(expected_eos) not in tokenizer.additional_special_tokens)
+            self.assertIn(new_eos, tokenizer.added_tokens_decoder.values())
+            self.assertEqual(tokenizer.added_tokens_decoder[tokenizer.eos_token_id], new_eos)
+            self.assertDictEqual(expected, tokenizer.added_tokens_decoder)
+            return tokenizer
+
+        new_eos = AddedToken("[NEW_EOS]", rstrip=False, lstrip=True, normalized=False, special=True)
+        new_masked_token = AddedToken("[MASK]", lstrip=True, rstrip=False, normalized=False)
+        for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
+            with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
+                # Load a slow tokenizer from the hub, init with the new token for fast to also include it
+                tokenizer = self.tokenizer_class.from_pretrained(
+                    pretrained_name, eos_token=new_eos, mask_token=new_masked_token
+                )
+                EXPECTED_ADDED_TOKENS_DECODER = tokenizer.added_tokens_decoder
+                with self.subTest("Hub -> Slow: Test loading a slow tokenizer from the hub)"):
+                    self.assertEqual(tokenizer._eos_token, new_eos)
+                    self.assertIn(new_eos, list(tokenizer.added_tokens_decoder.values()))
+
+                with tempfile.TemporaryDirectory() as tmp_dir_2:
+                    tokenizer.save_pretrained(tmp_dir_2)
+                    with self.subTest(
+                        "Hub -> Slow -> Slow: Test saving this slow tokenizer and reloading it in the fast class"
+                    ):
+                        _test_added_vocab_and_eos(
+                            EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_2
+                        )
+
+                    if self.rust_tokenizer_class is not None:
+                        with self.subTest(
+                            "Hub -> Slow -> Fast: Test saving this slow tokenizer and reloading it in the fast class"
+                        ):
+                            tokenizer_fast = _test_added_vocab_and_eos(
+                                EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_2
+                            )
+                            with tempfile.TemporaryDirectory() as tmp_dir_3:
+                                tokenizer_fast.save_pretrained(tmp_dir_3)
+                                with self.subTest(
+                                    "Hub -> Slow -> Fast -> Fast: Test saving this fast tokenizer and reloading it in the fast class"
+                                ):
+                                    _test_added_vocab_and_eos(
+                                        EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
+                                    )
+
+                                with self.subTest(
+                                    "Hub -> Slow -> Fast -> Slow: Test saving this slow tokenizer and reloading it in the slow class"
+                                ):
+                                    _test_added_vocab_and_eos(
+                                        EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_3
+                                    )
+
+                with self.subTest("Hub -> Fast: Test loading a fast tokenizer from the hub)"):
+                    if self.rust_tokenizer_class is not None:
+                        tokenizer_fast = self.rust_tokenizer_class.from_pretrained(pretrained_name, eos_token=new_eos)
+                        self.assertEqual(tokenizer_fast._eos_token, new_eos)
+                        self.assertIn(new_eos, list(tokenizer_fast.added_tokens_decoder.values()))
+                        # We can't test the following because for BC we kept the default rstrip lstrip in slow not fast. Will comment once normalization is alright
+                        with self.subTest("Hub -> Fast == Hub -> Slow: make sure slow and fast tokenizer match"):
+                            self.assertDictEqual(EXPECTED_ADDED_TOKENS_DECODER, tokenizer_fast.added_tokens_decoder)
+
+                        EXPECTED_ADDED_TOKENS_DECODER = tokenizer_fast.added_tokens_decoder
+                        with tempfile.TemporaryDirectory() as tmp_dir_4:
+                            tokenizer_fast.save_pretrained(tmp_dir_4)
+                            with self.subTest("Hub -> Fast -> Fast: saving Fast1 locally and loading"):
+                                _test_added_vocab_and_eos(
+                                    EXPECTED_ADDED_TOKENS_DECODER, self.rust_tokenizer_class, new_eos, tmp_dir_4
+                                )
+
+                            with self.subTest("Hub -> Fast -> Slow: saving Fast1 locally and loading"):
+                                _test_added_vocab_and_eos(
+                                    EXPECTED_ADDED_TOKENS_DECODER, self.tokenizer_class, new_eos, tmp_dir_4
+                                )
diff --git a/tests/models/resnet/test_modeling_resnet.py b/tests/models/resnet/test_modeling_resnet.py
index 4cfa18d6bcf456..bae9eb6d24c8cb 100644
--- a/tests/models/resnet/test_modeling_resnet.py
+++ b/tests/models/resnet/test_modeling_resnet.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ResNet model. """
 
 
-import inspect
 import unittest
 
 from transformers import ResNetConfig
@@ -206,18 +205,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/sam/test_modeling_sam.py b/tests/models/sam/test_modeling_sam.py
index a84e3695c13fa1..3e63edb23a0c6d 100644
--- a/tests/models/sam/test_modeling_sam.py
+++ b/tests/models/sam/test_modeling_sam.py
@@ -16,7 +16,6 @@
 
 
 import gc
-import inspect
 import unittest
 
 import requests
@@ -338,18 +337,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
index ab7a48694d2fd7..cddaddda183cd7 100644
--- a/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
+++ b/tests/models/seamless_m4t/test_modeling_seamless_m4t.py
@@ -16,7 +16,6 @@
 
 
 import copy
-import inspect
 import tempfile
 import unittest
 
@@ -34,6 +33,7 @@
     ids_tensor,
     random_attention_mask,
 )
+from ...test_pipeline_mixin import PipelineTesterMixin
 
 
 if is_torch_available():
@@ -478,10 +478,6 @@ def test_model_weights_reload_no_missing_tied_weights(self):
     def test_save_load_fast_init_to_base(self):
         pass
 
-    @unittest.skip(reason="The speech encoder doesn't support head masking")
-    def test_generate_with_head_masking(self):
-        pass
-
     @unittest.skip(reason="SeamlessM4TModel can takes input_ids or input_features")
     def test_forward_signature(self):
         pass
@@ -616,7 +612,9 @@ def test_attention_outputs(self):
 
 
 @require_torch
-class SeamlessM4TModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+class SeamlessM4TModelWithTextInputTest(
+    ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase
+):
     is_encoder_decoder = True
     fx_compatible = False
     test_missing_keys = False
@@ -636,6 +634,19 @@ class SeamlessM4TModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin,
         else ()
     )
     all_generative_model_classes = (SeamlessM4TForTextToText,) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {
+            "automatic-speech-recognition": SeamlessM4TForSpeechToText,
+            "conversational": SeamlessM4TForTextToText,
+            "feature-extraction": SeamlessM4TModel,
+            "summarization": SeamlessM4TForTextToText,
+            "text-to-audio": SeamlessM4TForTextToSpeech,
+            "text2text-generation": SeamlessM4TForTextToText,
+            "translation": SeamlessM4TForTextToText,
+        }
+        if is_torch_available()
+        else {}
+    )
 
     def setUp(self):
         self.model_tester = SeamlessM4TModelTester(self, input_modality="text")
@@ -698,43 +709,6 @@ def test_initialization(self):
     def test_model_weights_reload_no_missing_tied_weights(self):
         pass
 
-    def test_generate_with_head_masking(self):
-        """Test designed for encoder-decoder models to ensure the attention head masking is used."""
-        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
-        for model_class in self.all_generative_model_classes:
-            config, input_ids, attention_mask, max_length = self._get_input_ids_and_config()
-
-            model = model_class(config).to(torch_device).eval()
-
-            head_masking = {
-                "head_mask": torch.zeros(config.encoder_layers, config.encoder_attention_heads, device=torch_device),
-                "decoder_head_mask": torch.zeros(
-                    config.decoder_layers, config.decoder_attention_heads, device=torch_device
-                ),
-                "cross_attn_head_mask": torch.zeros(
-                    config.decoder_layers, config.decoder_attention_heads, device=torch_device
-                ),
-            }
-
-            signature = inspect.signature(model.forward)
-            # We want to test only models where encoder/decoder head masking is implemented
-            if not set(head_masking.keys()) < {*signature.parameters.keys()}:
-                continue
-
-            for attn_name, (name, mask) in zip(attention_names, head_masking.items()):
-                out = model.generate(
-                    input_ids,
-                    attention_mask=attention_mask,
-                    num_beams=1,
-                    output_attentions=True,
-                    return_dict_in_generate=True,
-                    remove_invalid_values=True,
-                    **{name: mask},
-                )
-                # We check the state of decoder_attentions and cross_attentions just from the last step
-                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
-                self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
-
     @unittest.skip(reason="SeamlessM4TModel can take input_ids or input_features")
     def test_forward_signature(self):
         pass
diff --git a/tests/models/seamless_m4t_v2/__init__.py b/tests/models/seamless_m4t_v2/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
new file mode 100644
index 00000000000000..8627220c71aa51
--- /dev/null
+++ b/tests/models/seamless_m4t_v2/test_modeling_seamless_m4t_v2.py
@@ -0,0 +1,1165 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch SeamlessM4Tv2 model. """
+
+
+import copy
+import tempfile
+import unittest
+
+from transformers import SeamlessM4Tv2Config, is_speech_available, is_torch_available
+from transformers.testing_utils import require_torch, slow, torch_device
+from transformers.trainer_utils import set_seed
+from transformers.utils import cached_property
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    ModelTesterMixin,
+    _config_zero_init,
+    floats_tensor,
+    ids_tensor,
+    random_attention_mask,
+)
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import (
+        SeamlessM4Tv2ForSpeechToSpeech,
+        SeamlessM4Tv2ForSpeechToText,
+        SeamlessM4Tv2ForTextToSpeech,
+        SeamlessM4Tv2ForTextToText,
+        SeamlessM4Tv2Model,
+    )
+    from transformers.models.seamless_m4t_v2.modeling_seamless_m4t_v2 import (
+        SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST,
+    )
+
+if is_speech_available():
+    from transformers import SeamlessM4TProcessor
+
+
+class SeamlessM4Tv2ModelTester:
+    def __init__(
+        self,
+        parent,
+        input_modality="speech",
+        batch_size=2,
+        seq_length=4,
+        is_training=True,
+        use_input_mask=True,
+        use_token_type_ids=True,
+        use_labels=True,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        initializer_range=0.02,
+        max_new_tokens=None,
+        num_labels=3,
+        num_choices=4,
+        scope=None,
+        vocab_size=20,
+        t2u_vocab_size=20,
+        hidden_size=6,
+        num_hidden_layers=2,
+        intermediate_size=6,
+        max_position_embeddings=256,
+        encoder_layers=2,
+        decoder_layers=2,
+        encoder_ffn_dim=6,
+        decoder_ffn_dim=6,
+        t2u_encoder_layers=2,
+        t2u_decoder_layers=2,
+        t2u_encoder_ffn_dim=6,
+        t2u_decoder_ffn_dim=6,
+        num_heads=2,
+        vocoder_num_spkrs=5,
+        vocoder_num_langs=5,
+        upsample_initial_channel=32,
+        unit_embed_dim=25,
+        spkr_embed_dim=6,
+        lang_embed_dim=6,
+        num_conv_pos_embeddings=8,
+        unit_hifi_gan_vocab_size=20,
+        t2u_num_langs=0,
+        t2u_offset_tgt_lang=0,
+        vocoder_offset=0,
+        t2u_variance_predictor_hidden_dim=4,
+        char_vocab_size=4,
+        left_max_position_embeddings=2,
+        right_max_position_embeddings=1,
+        speech_encoder_chunk_size=2,
+        speech_encoder_left_chunk_num=1,
+    ):
+        self.parent = parent
+        self.input_modality = input_modality
+
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.is_training = is_training
+        self.use_input_mask = use_input_mask
+        self.use_token_type_ids = use_token_type_ids
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.num_labels = num_labels
+        self.num_choices = num_choices
+        self.scope = scope
+
+        self.vocab_size = vocab_size
+        self.t2u_vocab_size = t2u_vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.max_position_embeddings = max_position_embeddings
+        self.encoder_layers = encoder_layers
+        self.decoder_layers = decoder_layers
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.t2u_encoder_layers = t2u_encoder_layers
+        self.t2u_decoder_layers = t2u_decoder_layers
+        self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim
+        self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim
+        self.num_heads = num_heads
+        self.num_attention_heads = num_heads
+
+        self.vocoder_num_spkrs = vocoder_num_spkrs
+        self.vocoder_num_langs = vocoder_num_langs
+        self.upsample_initial_channel = upsample_initial_channel
+        self.unit_embed_dim = unit_embed_dim
+        self.spkr_embed_dim = spkr_embed_dim
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.lang_embed_dim = lang_embed_dim
+
+        self.max_new_tokens = max_new_tokens
+
+        self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size
+        self.t2u_num_langs = t2u_num_langs
+        self.t2u_offset_tgt_lang = t2u_offset_tgt_lang
+        self.vocoder_offset = vocoder_offset
+
+        self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim
+        self.char_vocab_size = char_vocab_size
+        self.left_max_position_embeddings = left_max_position_embeddings
+        self.right_max_position_embeddings = right_max_position_embeddings
+        self.speech_encoder_chunk_size = speech_encoder_chunk_size
+        self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num
+
+    def prepare_config_and_inputs(self):
+        if self.input_modality == "text":
+            inputs = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1)
+        else:
+            inputs = ids_tensor([self.batch_size, self.seq_length, 160], self.vocab_size - 1).float()
+
+        input_mask = None
+        if self.use_input_mask:
+            input_mask = random_attention_mask([self.batch_size, self.seq_length])
+
+        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size - 1)
+
+        lm_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+
+        config = self.get_config()
+
+        return config, inputs, decoder_input_ids, input_mask, lm_labels
+
+    def get_config(self):
+        return SeamlessM4Tv2Config(
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            initializer_range=self.initializer_range,
+            vocab_size=self.vocab_size,
+            t2u_vocab_size=self.t2u_vocab_size,
+            hidden_size=self.hidden_size,
+            speech_encoder_layers=self.num_heads,
+            speech_encoder_intermediate_size=self.intermediate_size,
+            max_position_embeddings=self.max_position_embeddings,
+            encoder_layers=self.encoder_layers,
+            decoder_layers=self.decoder_layers,
+            encoder_ffn_dim=self.encoder_ffn_dim,
+            decoder_ffn_dim=self.decoder_ffn_dim,
+            t2u_encoder_layers=self.t2u_encoder_layers,
+            t2u_decoder_layers=self.t2u_decoder_layers,
+            t2u_encoder_ffn_dim=self.t2u_encoder_ffn_dim,
+            t2u_decoder_ffn_dim=self.t2u_decoder_ffn_dim,
+            num_attention_heads=self.num_heads,
+            encoder_attention_heads=self.num_heads,
+            decoder_attention_heads=self.num_heads,
+            t2u_encoder_attention_heads=self.num_heads,
+            t2u_decoder_attention_heads=self.num_heads,
+            speech_encoder_attention_heads=self.num_heads,
+            unit_hifigan_vocab_vise=self.t2u_vocab_size,
+            vocoder_num_spkrs=self.vocoder_num_spkrs,
+            vocoder_num_langs=self.vocoder_num_langs,
+            upsample_initial_channel=self.upsample_initial_channel,
+            unit_embed_dim=self.unit_embed_dim,
+            spkr_embed_dim=self.spkr_embed_dim,
+            num_conv_pos_embeddings=self.num_conv_pos_embeddings,
+            lang_embed_dim=self.lang_embed_dim,
+            max_new_tokens=self.max_new_tokens,
+            unit_hifi_gan_vocab_size=self.unit_hifi_gan_vocab_size,
+            t2u_num_langs=self.t2u_num_langs,
+            t2u_offset_tgt_lang=self.t2u_offset_tgt_lang,
+            vocoder_offset=self.vocoder_offset,
+            t2u_variance_predictor_embed_dim=self.hidden_size,
+            t2u_variance_predictor_hidden_dim=self.t2u_variance_predictor_hidden_dim,
+            char_vocab_size=self.char_vocab_size,
+            left_max_position_embeddings=self.left_max_position_embeddings,
+            right_max_position_embeddings=self.right_max_position_embeddings,
+            speech_encoder_chunk_size=self.speech_encoder_chunk_size,
+            speech_encoder_left_chunk_num=self.speech_encoder_left_chunk_num,
+        )
+
+    def prepare_config_and_inputs_for_decoder(self):
+        (
+            config,
+            input_ids,
+            decoder_input_ids,
+            input_mask,
+            lm_labels,
+        ) = self.prepare_config_and_inputs()
+
+        config.is_decoder = True
+
+        encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
+        encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+        return (
+            config,
+            input_ids,
+            decoder_input_ids,
+            input_mask,
+            lm_labels,
+            encoder_hidden_states,
+            encoder_attention_mask,
+        )
+
+    def create_and_check_model(self, config, input_ids, decoder_input_ids, input_mask, labels):
+        model = SeamlessM4Tv2Model(config=config)
+        model.to(torch_device)
+        model.eval()
+        if self.input_modality == "text":
+            result = model(input_ids=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids)
+            result = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+        else:
+            result = model(input_features=input_ids, attention_mask=input_mask, decoder_input_ids=decoder_input_ids)
+            result = model(input_features=input_ids, decoder_input_ids=decoder_input_ids)
+            self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+        decoder_output = result.logits
+        decoder_past = result.past_key_values
+        encoder_output = result.encoder_last_hidden_state
+
+        if self.input_modality == "text":
+            seq_length = self.seq_length
+        else:
+            # if speech, expected length has been subsampled.
+            seq_length = model._compute_sub_sample_lengths_from_attention_mask(input_mask).max().item()
+
+        self.parent.assertEqual(encoder_output.size(), (self.batch_size, seq_length, self.hidden_size))
+        self.parent.assertEqual(decoder_output.size(), (self.batch_size, decoder_input_ids.shape[1], self.vocab_size))
+        # There should be `num_layers` key value embeddings stored in decoder_past
+        self.parent.assertEqual(len(decoder_past), config.decoder_layers)
+        # There should be a self attn key, a self attn value, a cross attn key and a cross attn value stored in each decoder_past tuple
+        self.parent.assertEqual(len(decoder_past[0]), 4)
+
+    def create_and_check_decoder_model_past_large_inputs(
+        self,
+        config,
+        input_ids,
+        decoder_input_ids,
+        input_mask,
+        lm_labels,
+        encoder_hidden_states,
+        encoder_attention_mask,
+    ):
+        config.is_decoder = True
+        model = SeamlessM4Tv2Model(config=config)
+        model.to(torch_device)
+        model.eval()
+
+        # make sure no pad token in decoder_input_ids
+        decoder_input_ids = torch.clamp(decoder_input_ids, config.pad_token_id + 1)
+
+        # first forward pass
+        outputs = model(
+            input_ids, decoder_input_ids=decoder_input_ids, decoder_attention_mask=input_mask, use_cache=True
+        )
+        past_key_values = outputs.past_key_values
+
+        # create hypothetical multiple next token and extent to next_input_ids
+        next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size)
+        next_mask = ids_tensor((self.batch_size, 3), vocab_size=2)
+
+        # append to next input_ids and
+        next_input_ids = torch.cat([decoder_input_ids, next_tokens], dim=-1)
+        next_attention_mask = torch.cat([input_mask, next_mask], dim=-1)
+
+        output_from_no_past = model(
+            input_ids,
+            decoder_input_ids=next_input_ids,
+            decoder_attention_mask=next_attention_mask,
+            output_hidden_states=True,
+        )
+        output_from_no_past = output_from_no_past["decoder_hidden_states"][0]
+        output_from_past = model(
+            input_ids,
+            decoder_input_ids=next_tokens,
+            decoder_attention_mask=next_attention_mask,
+            past_key_values=past_key_values,
+            output_hidden_states=True,
+        )["decoder_hidden_states"][0]
+
+        # select random slice
+        random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item()
+        output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach()
+        output_from_past_slice = output_from_past[:, :, random_slice_idx].detach()
+
+        self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1])
+
+        # test that outputs are equal for slice
+        self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        (
+            config,
+            input_ids,
+            decoder_input_ids,
+            input_mask,
+            lm_labels,
+        ) = config_and_inputs
+
+        input_name = "input_ids" if self.input_modality == "text" else "input_features"
+
+        inputs_dict = {
+            input_name: input_ids,
+            "attention_mask": input_mask,
+            "decoder_input_ids": decoder_input_ids,
+            "labels": lm_labels,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+class SeamlessM4Tv2ModelWithSpeechInputTest(ModelTesterMixin, unittest.TestCase):
+    is_encoder_decoder = True
+    fx_compatible = False
+    test_missing_keys = False
+    test_pruning = False
+    test_model_parallel = False
+    test_resize_embeddings = False
+    test_headmasking = False
+    test_torchscript = False
+
+    all_model_classes = (
+        (
+            SeamlessM4Tv2Model,
+            SeamlessM4Tv2ForSpeechToSpeech,
+            SeamlessM4Tv2ForSpeechToText,
+        )
+        if is_torch_available()
+        else ()
+    )
+    all_generative_model_classes = (SeamlessM4Tv2ForSpeechToText,) if is_torch_available() else ()
+
+    input_name = "input_features"
+
+    def setUp(self):
+        self.model_tester = SeamlessM4Tv2ModelTester(self, input_modality="speech")
+        self.config_tester = ConfigTester(self, config_class=SeamlessM4Tv2Config)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST:
+            model = SeamlessM4Tv2Model.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+    def _get_input_ids_and_config(self, batch_size=2):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        input_ids = inputs_dict[self.input_name]
+
+        # cut to half length & take max batch_size 3
+        sequence_length = input_ids.shape[-1] // 2
+        input_ids = input_ids[:batch_size, :sequence_length]
+
+        # generate max 3 tokens
+        max_length = input_ids.shape[-1] + 3
+        if config.eos_token_id is not None and config.pad_token_id is None:
+            # hack to allow generate for models such as GPT2 as is done in `generate()`
+            if isinstance(config.eos_token_id, int):
+                config.eos_token_id = [config.eos_token_id]
+            config.pad_token_id = config.eos_token_id[0]
+
+        attention_mask = torch.ones(input_ids.shape[:2], dtype=torch.long)[:batch_size, :sequence_length]
+
+        return config, input_ids.float(), attention_mask, max_length
+
+    @staticmethod
+    def _get_encoder_outputs(
+        model, input_ids, attention_mask, output_attentions=None, output_hidden_states=None, num_interleave=1
+    ):
+        encoder = model.get_encoder()
+        encoder_outputs = encoder(
+            input_ids,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+        encoder_outputs["last_hidden_state"] = encoder_outputs.last_hidden_state.repeat_interleave(
+            num_interleave, dim=0
+        )
+        input_ids = (
+            torch.zeros(input_ids.shape[:2], dtype=torch.int64, layout=input_ids.layout, device=input_ids.device)
+            + model._get_decoder_start_token_id()
+        )
+        attention_mask = None
+        return encoder_outputs, input_ids, attention_mask
+
+    def test_initialization(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                uniform_init_parms = [
+                    "conv",
+                    "masked_spec_embed",
+                    "codevectors",
+                    "quantizer.weight_proj.weight",
+                    "project_hid.weight",
+                    "project_hid.bias",
+                    "project_q.weight",
+                    "project_q.bias",
+                    "pos_bias_v",
+                    "pos_bias_u",
+                    "pointwise_conv1",
+                    "pointwise_conv2",
+                    "feature_projection.projection.weight",
+                    "feature_projection.projection.bias",
+                    "objective.weight",
+                    "adapter",
+                ]
+                if param.requires_grad:
+                    if any(x in name for x in uniform_init_parms):
+                        self.assertTrue(
+                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+                    else:
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+
+    @unittest.skip(reason="SeamlessM4Tv2SpeechEncoder doesn't have an embedding layer")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(
+        reason="Expected missing keys serve when using SeamlessM4Tv2ForXXX.from_pretrained from a checkpoint saved by SeamlessM4Tv2Model.save_pretrained."
+    )
+    def test_model_weights_reload_no_missing_tied_weights(self):
+        pass
+
+    @unittest.skip(
+        reason="SeamlessM4Tv2Model is base class but has actually a bigger architecture than seamlessM4T task-specific models."
+    )
+    def test_save_load_fast_init_to_base(self):
+        pass
+
+    @unittest.skip(reason="SeamlessM4Tv2Model can takes input_ids or input_features")
+    def test_forward_signature(self):
+        pass
+
+    @unittest.skip(reason="SeamlessM4Tv2 has no base model")
+    def test_save_load_fast_init_from_base(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+    def test_attention_outputs(self):
+        # expected length is subsampled so need to change a bit this test
+        if not self.has_attentions:
+            self.skipTest(reason="Model does not output attentions")
+
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        config.return_dict = True
+
+        seq_len = getattr(self.model_tester, "seq_length", None)
+        decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len)
+        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
+        decoder_key_length = getattr(self.model_tester, "decoder_key_length", decoder_seq_length)
+        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
+        # no more chunk_length test
+
+        for model_class in self.all_model_classes:
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = False
+            config.return_dict = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            # check that output_attentions also work using config
+            del inputs_dict["output_attentions"]
+            config.output_attentions = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+
+            self.assertListEqual(
+                list(attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+            out_len = len(outputs)
+
+            if self.is_encoder_decoder:
+                correct_outlen = 5
+
+                # loss is at first position
+                if "labels" in inputs_dict:
+                    correct_outlen += 1  # loss is added to beginning
+                if "past_key_values" in outputs:
+                    correct_outlen += 1  # past_key_values have been returned
+
+                self.assertEqual(out_len, correct_outlen)
+
+                # decoder attentions
+                decoder_attentions = outputs.decoder_attentions
+                self.assertIsInstance(decoder_attentions, (list, tuple))
+                self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                self.assertListEqual(
+                    list(decoder_attentions[0].shape[-3:]),
+                    [self.model_tester.num_attention_heads, decoder_seq_length, decoder_key_length],
+                )
+
+                # cross attentions
+                cross_attentions = outputs.cross_attentions
+                self.assertIsInstance(cross_attentions, (list, tuple))
+                self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+
+                sub_sampled_length = (
+                    model._compute_sub_sample_lengths_from_attention_mask(inputs_dict["attention_mask"]).max().item()
+                )
+                self.assertListEqual(
+                    list(cross_attentions[0].shape[-3:]),
+                    [
+                        self.model_tester.num_attention_heads,
+                        decoder_seq_length,
+                        sub_sampled_length,
+                    ],
+                )
+
+            # Check attention is always last and order is fine
+            inputs_dict["output_attentions"] = True
+            inputs_dict["output_hidden_states"] = True
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+            with torch.no_grad():
+                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
+
+            if hasattr(self.model_tester, "num_hidden_states_types"):
+                added_hidden_states = self.model_tester.num_hidden_states_types
+            elif self.is_encoder_decoder:
+                added_hidden_states = 2
+            else:
+                added_hidden_states = 1
+            self.assertEqual(out_len + added_hidden_states, len(outputs))
+
+            self_attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
+
+            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+            self.assertListEqual(
+                list(self_attentions[0].shape[-3:]),
+                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
+            )
+
+
+@require_torch
+class SeamlessM4Tv2ModelWithTextInputTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    is_encoder_decoder = True
+    fx_compatible = False
+    test_missing_keys = False
+    test_pruning = False
+    test_model_parallel = False
+    test_resize_embeddings = True
+    test_headmasking = False
+    test_torchscript = False
+
+    all_model_classes = (
+        (
+            SeamlessM4Tv2Model,
+            SeamlessM4Tv2ForTextToSpeech,
+            SeamlessM4Tv2ForTextToText,
+        )
+        if is_torch_available()
+        else ()
+    )
+    all_generative_model_classes = (SeamlessM4Tv2ForTextToText,) if is_torch_available() else ()
+
+    def setUp(self):
+        self.model_tester = SeamlessM4Tv2ModelTester(self, input_modality="text")
+        self.config_tester = ConfigTester(self, config_class=SeamlessM4Tv2Config)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @slow
+    def test_model_from_pretrained(self):
+        for model_name in SEAMLESS_M4T_V2_PRETRAINED_MODEL_ARCHIVE_LIST:
+            model = SeamlessM4Tv2Model.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+
+    def test_initialization(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                uniform_init_parms = [
+                    "conv",
+                    "masked_spec_embed",
+                    "codevectors",
+                    "quantizer.weight_proj.weight",
+                    "project_hid.weight",
+                    "project_hid.bias",
+                    "project_q.weight",
+                    "project_q.bias",
+                    "pos_bias_v",
+                    "pos_bias_u",
+                    "pointwise_conv1",
+                    "pointwise_conv2",
+                    "feature_projection.projection.weight",
+                    "feature_projection.projection.bias",
+                    "objective.weight",
+                    "adapter",
+                ]
+                if param.requires_grad:
+                    if any(x in name for x in uniform_init_parms):
+                        self.assertTrue(
+                            -1.0 <= ((param.data.mean() * 1e9).round() / 1e9).item() <= 1.0,
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+                    else:
+                        self.assertIn(
+                            ((param.data.mean() * 1e9).round() / 1e9).item(),
+                            [0.0, 1.0],
+                            msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                        )
+
+    @unittest.skip(
+        reason="Expected missing keys serve when using SeamlessM4Tv2ForXXX.from_pretrained from a checkpoint saved by SeamlessM4Tv2Model.save_pretrained."
+    )
+    def test_model_weights_reload_no_missing_tied_weights(self):
+        pass
+
+    @unittest.skip(reason="SeamlessM4Tv2Model can take input_ids or input_features")
+    def test_forward_signature(self):
+        pass
+
+    def test_decoder_model_past_with_large_inputs(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs)
+
+    @unittest.skip(
+        reason="SeamlessM4Tv2Model is base class but has actually a bigger architecture than seamlessM4T task-specific models."
+    )
+    def test_save_load_fast_init_to_base(self):
+        pass
+
+    @unittest.skip(reason="SeamlessM4Tv2 has no base model")
+    def test_save_load_fast_init_from_base(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+
+@require_torch
+class SeamlessM4Tv2GenerationTest(unittest.TestCase):
+    # test that non-standard generation works
+    # test generation of: SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToSpeech, SeamlessM4Tv2ForSpeechToText, SeamlessM4Tv2ForTextToSpeech
+
+    def setUp(self):
+        self.speech_model_tester = SeamlessM4Tv2ModelTester(self, input_modality="speech")
+        self.text_model_tester = SeamlessM4Tv2ModelTester(self, input_modality="text")
+        self.tmpdirname = tempfile.mkdtemp()
+
+    def update_generation(self, model):
+        lang_code_to_id = {
+            "fra": 4,
+            "eng": 4,
+        }
+
+        id_to_text = {str(i): "a" for i in range(model.config.vocab_size)}
+        id_to_text["0"] = "ab"
+        id_to_text["1"] = "_b"
+        id_to_text["3"] = ","
+        id_to_text["4"] = "_cd"
+
+        char_to_id = {char: i for (i, char) in enumerate("abcd")}
+
+        generation_config = copy.deepcopy(model.generation_config)
+
+        generation_config.__setattr__("text_decoder_lang_to_code_id", lang_code_to_id)
+        generation_config.__setattr__("t2u_lang_code_to_id", lang_code_to_id)
+        generation_config.__setattr__("vocoder_lang_code_to_id", lang_code_to_id)
+        generation_config.__setattr__("id_to_text", id_to_text)
+        generation_config.__setattr__("char_to_id", char_to_id)
+        generation_config.__setattr__("eos_token_id", 0)
+
+        generation_config._from_model_config = False
+
+        model.generation_config = generation_config
+
+    def prepare_text_input(self):
+        config, inputs, decoder_input_ids, input_mask, lm_labels = self.text_model_tester.prepare_config_and_inputs()
+
+        input_dict = {
+            "input_ids": inputs,
+            "attention_mask": input_mask,
+            "tgt_lang": "eng",
+            "num_beams": 2,
+            "do_sample": True,
+        }
+
+        return config, input_dict
+
+    def prepare_speech_input(self):
+        config, inputs, decoder_input_ids, input_mask, lm_labels = self.speech_model_tester.prepare_config_and_inputs()
+
+        input_dict = {
+            "input_features": inputs,
+            "attention_mask": input_mask,
+            "tgt_lang": "fra",
+            "num_beams": 2,
+            "do_sample": True,
+        }
+
+        return config, input_dict
+
+    def prepare_speech_and_text_input(self):
+        config, inputs, decoder_input_ids, input_mask, lm_labels = self.speech_model_tester.prepare_config_and_inputs()
+
+        input_speech = {
+            "input_features": inputs,
+            "attention_mask": input_mask,
+            "tgt_lang": "fra",
+            "num_beams": 2,
+            "do_sample": True,
+        }
+
+        config, inputs, decoder_input_ids, input_mask, lm_labels = self.text_model_tester.prepare_config_and_inputs()
+
+        input_text = {
+            "input_ids": inputs,
+            "attention_mask": input_mask,
+            "tgt_lang": "eng",
+            "num_beams": 2,
+            "do_sample": True,
+        }
+        return config, input_speech, input_text
+
+    def factory_generation_speech_test(self, model, inputs):
+        set_seed(0)
+        output = model.generate(**inputs)
+        return output
+
+    def test_speech_generation(self):
+        config, input_speech, input_text = self.prepare_speech_and_text_input()
+
+        model = SeamlessM4Tv2Model(config=config)
+        self.update_generation(model)
+        model.save_pretrained(self.tmpdirname)
+        model.to(torch_device)
+        model.eval()
+
+        output_original_text = self.factory_generation_speech_test(model, input_text)
+        output_original_speech = self.factory_generation_speech_test(model, input_speech)
+
+        state_dict = model.state_dict()
+
+        text_model = SeamlessM4Tv2ForTextToSpeech.from_pretrained(self.tmpdirname)
+        self.update_generation(text_model)
+        text_model.to(torch_device)
+        text_model.eval()
+
+        output_text = self.factory_generation_speech_test(model, input_text)
+
+        speech_model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained(self.tmpdirname)
+        self.update_generation(speech_model)
+        speech_model.to(torch_device)
+        speech_model.eval()
+
+        for name, tensor in speech_model.state_dict().items():
+            right_tensor = state_dict.get(name)
+            self.assertEqual(tensor.tolist(), right_tensor.tolist(), f"Tensor {name}")
+
+        output_speech = self.factory_generation_speech_test(model, input_speech)
+
+        # test same text output from input text
+        self.assertListEqual(output_original_text[0].ravel().tolist(), output_text[0].ravel().tolist())
+        self.assertListEqual(output_original_text[1].ravel().tolist(), output_text[1].ravel().tolist())
+
+        # test same speech output from input text
+        # assertTrue because super long list makes this hang in case of failure
+        self.assertTrue(
+            output_original_speech[0].ravel().tolist() == output_speech[0].ravel().tolist(),
+            "Speech generated was different",
+        )
+        self.assertTrue(
+            output_original_speech[1].ravel().tolist() == output_speech[1].ravel().tolist(),
+            "Speech generated was different",
+        )
+
+    def test_text_generation(self):
+        config, input_speech, input_text = self.prepare_speech_and_text_input()
+
+        # to return speech
+        input_speech["generate_speech"] = False
+        input_text["generate_speech"] = False
+
+        model = SeamlessM4Tv2Model(config=config)
+        self.update_generation(model)
+        model.save_pretrained(self.tmpdirname)
+        model.to(torch_device)
+        model.eval()
+
+        output_original_text = self.factory_generation_speech_test(model, input_text)
+        output_original_speech = self.factory_generation_speech_test(model, input_speech)
+
+        # other models don't need it
+        input_speech.pop("generate_speech")
+        input_text.pop("generate_speech")
+
+        state_dict = model.state_dict()
+
+        text_model = SeamlessM4Tv2ForTextToText.from_pretrained(self.tmpdirname)
+        self.update_generation(text_model)
+        text_model.to(torch_device)
+        text_model.eval()
+
+        for name, tensor in text_model.state_dict().items():
+            right_tensor = state_dict.get(name)
+            self.assertEqual(tensor.tolist(), right_tensor.tolist())
+
+        output_text = self.factory_generation_speech_test(text_model, input_text)
+
+        speech_model = SeamlessM4Tv2ForSpeechToText.from_pretrained(self.tmpdirname)
+
+        for name, tensor in speech_model.state_dict().items():
+            right_tensor = state_dict.get(name)
+            self.assertEqual(tensor.tolist(), right_tensor.tolist(), f"Tensor {name}")
+
+        self.update_generation(speech_model)
+        speech_model.to(torch_device)
+        speech_model.eval()
+
+        output_speech = self.factory_generation_speech_test(speech_model, input_speech)
+
+        # test same text output from input text
+        self.assertListEqual(output_original_text[0].ravel().tolist(), output_text.ravel().tolist())
+
+        # test same speech output from input text
+        self.assertListEqual(output_original_speech[0].ravel().tolist(), output_speech.ravel().tolist())
+
+    def test_generation(self):
+        config, input_speech, input_text = self.prepare_speech_and_text_input()
+
+        input_speech["num_beams"] = 3
+        input_speech["do_sample"] = True
+        input_speech["temperature"] = 0.5
+        input_speech["num_return_sequences"] = 3
+
+        input_text["num_beams"] = 3
+        input_text["do_sample"] = True
+        input_text["temperature"] = 0.5
+        input_text["num_return_sequences"] = 3
+
+        for model_class in [SeamlessM4Tv2ForSpeechToSpeech, SeamlessM4Tv2ForSpeechToText, SeamlessM4Tv2Model]:
+            model = model_class(config=config)
+            self.update_generation(model)
+            model.to(torch_device)
+            model.eval()
+
+            output = model.generate(**input_speech)
+            output = output[0] if isinstance(output, tuple) else output
+
+            self.assertEqual(output.shape[0], 3 * input_speech["input_features"].shape[0])
+
+        for model_class in [SeamlessM4Tv2ForTextToSpeech, SeamlessM4Tv2ForTextToText, SeamlessM4Tv2Model]:
+            model = model_class(config=config)
+            self.update_generation(model)
+            model.to(torch_device)
+            model.eval()
+
+            output = model.generate(**input_text)
+
+            output = output[0] if isinstance(output, tuple) else output
+
+            self.assertEqual(output.shape[0], 3 * input_text["input_ids"].shape[0])
+
+
+@require_torch
+class SeamlessM4Tv2ModelIntegrationTest(unittest.TestCase):
+    repo_id = "facebook/seamless-m4t-v2-large"
+
+    def assertListAlmostEqual(self, list1, list2, tol=1e-4):
+        self.assertEqual(len(list1), len(list2))
+        for a, b in zip(list1, list2):
+            self.assertAlmostEqual(a, b, delta=tol)
+
+    @cached_property
+    def processor(self):
+        return SeamlessM4TProcessor.from_pretrained(self.repo_id)
+
+    @cached_property
+    def input_text(self):
+        # corresponds to "C'est un test." with seamlessM4T_medium checkpoint
+
+        input_ids = torch.tensor([[256026, 109, 247729, 171, 128, 6816, 247676, 3]])  # fmt: skip
+
+        input_ids = input_ids.to(torch_device)
+
+        attention_mask = torch.ones_like(input_ids).to(torch_device)
+
+        inputs = {
+            "attention_mask": attention_mask,
+            "input_ids": input_ids,
+        }
+
+        return inputs
+
+    @cached_property
+    def input_audio(self):
+        set_seed(0)
+        seq_len = 20000
+        sampling_rate = 16000
+        input_features = torch.rand((2, seq_len))
+
+        return self.processor(audios=[input_features.tolist()], sampling_rate=sampling_rate, return_tensors="pt").to(
+            torch_device
+        )
+
+    def factory_test_task(self, class1, class2, inputs, class1_kwargs, class2_kwargs):
+        # half-precision loading to limit GPU usage
+        model1 = class1.from_pretrained(self.repo_id, torch_dtype=torch.float16).to(torch_device)
+        model2 = class2.from_pretrained(self.repo_id, torch_dtype=torch.float16).to(torch_device)
+
+        set_seed(0)
+        output_1 = model1.generate(**inputs, **class1_kwargs)
+        set_seed(0)
+        output_2 = model2.generate(**inputs, **class2_kwargs)
+
+        for key in output_1:
+            if isinstance(output_1[key], torch.Tensor):
+                if len(output_1[key].shape) == 0:
+                    self.assertEqual(output_1[key].item(), output_2[key].item())
+                else:
+                    self.assertListAlmostEqual(output_1[key].squeeze().tolist(), output_2[key].squeeze().tolist())
+
+    @slow
+    def test_to_eng_text(self):
+        model = SeamlessM4Tv2Model.from_pretrained(self.repo_id).to(torch_device)
+
+        # test text - tgt lang: eng
+
+        expected_text_tokens = [3, 256022, 3080, 1, 247669, 10, 6816, 247676, 3]  # fmt: skip
+
+        # fmt: off
+        expected_unit_tokens = [
+            4746,7163,8208,8208,1315,1266,4307,1119,989,9594,3007,3007,4341,5205,7631,7631,3202,4061,9092,3191,7509,1715,
+            5280,5280,3554,8812,8197,6366,5382,5382,7330,2758,9433,9433,6863,7510,5800,5800,5286,1948,1825,1825,3956,8724,
+            8724,5331,8914,9315,9315,5288,2588,8167,8787,8787,8063,6008,2621,2621,2621,5696
+        ]
+        # fmt: on
+
+        expected_wav_slice = [9.485097e-04, 8.320558e-04, 7.178137e-04, 9.349979e-04, 1.121628e-03, 1.091766e-03, 1.279693e-03, 1.387754e-03, 1.296396e-03, 1.143557e-03]  # fmt: skip
+
+        set_seed(0)
+        output = model.generate(**self.input_text, num_beams=1, tgt_lang="eng", return_intermediate_token_ids=True)
+
+        self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist())
+        self.assertListEqual(
+            expected_unit_tokens, (output.unit_sequences - model.config.vocoder_offset).squeeze().tolist()
+        )
+
+        self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60])
+
+        # assert mean and std equality
+        self.assertListAlmostEqual(
+            [-2.349690e-04, 9.920777e-02], [output.waveform.mean().item(), output.waveform.std().item()]
+        )
+
+    @slow
+    @unittest.skip(reason="Equivalence is broken since a new update")
+    def test_to_swh_text(self):
+        model = SeamlessM4Tv2Model.from_pretrained(self.repo_id).to(torch_device)
+
+        # test text - tgt lang: swh
+
+        expected_text_tokens = [3, 256084, 109, 247729, 171, 10, 6816, 247676, 3]  # fmt: skip
+
+        # fmt: off
+        expected_unit_tokens = [
+            5725,7163,7472,7472,6915,3099,3099,9921,2765,6515,6515,1374,1374,1347,8252,9854,9854,5662,2420,6600,2216,4503,
+            7208,6107,6107,7298,9123,6472,9663,9663,6366,6366,6445,575,3575,2052,2052,5788,5800,5800,5286,5286,1825,1825,3956,
+            3956,8724,8724,5331,8914,8914,9315,9315,2821,8167,8167,8787,8787,8787,8700,8700,8700,2175,2175,3196,3196,2621,1725,
+            1725,7507,5696
+        ]
+        # fmt: on
+
+        expected_wav_slice = [3.124037e-04, 2.450471e-04, 2.286572e-04, 2.317214e-04, 2.732605e-04, 2.478790e-04, 2.704144e-04, 2.665847e-04, 2.828784e-04, 2.684390e-04]  # fmt: skip
+
+        set_seed(0)
+        output = model.generate(**self.input_text, num_beams=1, tgt_lang="swh", return_intermediate_token_ids=True)
+
+        self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist())
+        self.assertListEqual(
+            expected_unit_tokens, (output.unit_sequences - model.config.vocoder_offset).squeeze().tolist()
+        )
+
+        self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60])
+
+        # assert mean and std equality
+        self.assertListAlmostEqual(
+            [-2.001826e-04, 8.580012e-02], [output.waveform.mean().item(), output.waveform.std().item()]
+        )
+
+    @slow
+    def test_to_rus_speech(self):
+        model = SeamlessM4Tv2Model.from_pretrained(self.repo_id).to(torch_device)
+
+        # test audio - tgt lang: rus
+
+        expected_text_tokens = [3, 256074, 107, 248213, 404, 247792, 247789, 3]  # fmt: skip
+
+        # fmt: off
+        expected_unit_tokens = [
+            8976,7163,6915,2728,2728,5198,3318,3318,3686,1049,9643,1200,2052,2052,8196,8196,7624,7624,7555,7555,7555,7555,
+            9717,9717,4869,8167,8167,8167,8053,972,9362,8167,297,297,297,3993,3993,3993,3993,4660,4660,4660,4660,4660,4660,
+            7962,7962,225,225,8737,4199
+        ]
+        # fmt: on
+
+        expected_wav_slice = [1.415287e-03, 1.360976e-03, 1.297727e-03, 1.305321e-03, 1.352087e-03, 1.283812e-03, 1.352623e-03, 1.387384e-03, 1.449627e-03, 1.411701e-03]  # fmt: skip
+
+        set_seed(0)
+        output = model.generate(**self.input_audio, num_beams=1, tgt_lang="rus", return_intermediate_token_ids=True)
+
+        self.assertListEqual(expected_text_tokens, output.sequences.squeeze().tolist())
+        self.assertListEqual(
+            expected_unit_tokens, (output.unit_sequences - model.config.vocoder_offset).squeeze().tolist()
+        )
+
+        self.assertListAlmostEqual(expected_wav_slice, output.waveform.squeeze().tolist()[50:60])
+
+        # assert mean and std equality - higher tolerance for speech
+        self.assertListAlmostEqual(
+            [-2.818016e-04, 7.169888e-02], [output.waveform.mean().item(), output.waveform.std().item()], tol=5e-4
+        )
+
+    @slow
+    def test_text_to_text_model(self):
+        kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False}
+        kwargs2 = {
+            "tgt_lang": "eng",
+            "output_hidden_states": True,
+            "return_dict_in_generate": True,
+            "output_scores": True,
+        }
+        self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForTextToText, self.input_text, kwargs1, kwargs2)
+
+    @slow
+    def test_speech_to_text_model(self):
+        kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True, "generate_speech": False}
+        kwargs2 = {
+            "tgt_lang": "eng",
+            "output_hidden_states": True,
+            "return_dict_in_generate": True,
+            "output_scores": True,
+        }
+        self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToText, self.input_audio, kwargs1, kwargs2)
+
+    @slow
+    def test_speech_to_speech_model(self):
+        kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True}
+        self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForSpeechToSpeech, self.input_audio, kwargs1, kwargs1)
+
+    @slow
+    def test_text_to_speech_model(self):
+        kwargs1 = {"tgt_lang": "eng", "return_intermediate_token_ids": True}
+
+        self.factory_test_task(SeamlessM4Tv2Model, SeamlessM4Tv2ForTextToSpeech, self.input_text, kwargs1, kwargs1)
diff --git a/tests/models/segformer/test_modeling_segformer.py b/tests/models/segformer/test_modeling_segformer.py
index 0506be9b1f1115..d9a4dce9ffeb3c 100644
--- a/tests/models/segformer/test_modeling_segformer.py
+++ b/tests/models/segformer/test_modeling_segformer.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch SegFormer model. """
 
 
-import inspect
 import unittest
 
 from transformers import SegformerConfig, is_torch_available, is_vision_available
@@ -212,18 +211,6 @@ def test_inputs_embeds(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
diff --git a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
index b2220a9e74d8a3..cbb449c6e7ae73 100644
--- a/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
+++ b/tests/models/speech_to_text_2/test_modeling_speech_to_text_2.py
@@ -196,10 +196,6 @@ def setUp(
     def test_inputs_embeds(self):
         pass
 
-    @unittest.skip("This test is currently broken because of safetensors.")
-    def test_tf_from_pt_safetensors(self):
-        pass
-
     # speech2text2 has no base model
     def test_save_load_fast_init_from_base(self):
         pass
diff --git a/tests/models/swiftformer/test_modeling_swiftformer.py b/tests/models/swiftformer/test_modeling_swiftformer.py
index 3e286cc32048b3..83b6aa3510d925 100644
--- a/tests/models/swiftformer/test_modeling_swiftformer.py
+++ b/tests/models/swiftformer/test_modeling_swiftformer.py
@@ -16,7 +16,6 @@
 
 
 import copy
-import inspect
 import unittest
 
 from transformers import PretrainedConfig, SwiftFormerConfig
@@ -177,18 +176,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/swin/test_modeling_swin.py b/tests/models/swin/test_modeling_swin.py
index 383f0fe867d4fb..e82c13f8db2744 100644
--- a/tests/models/swin/test_modeling_swin.py
+++ b/tests/models/swin/test_modeling_swin.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch Swin model. """
 
 import collections
-import inspect
 import unittest
 
 from transformers import SwinConfig
@@ -300,18 +299,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
diff --git a/tests/models/swin2sr/test_modeling_swin2sr.py b/tests/models/swin2sr/test_modeling_swin2sr.py
index f94e11ad646095..581e8debc7e7d7 100644
--- a/tests/models/swin2sr/test_modeling_swin2sr.py
+++ b/tests/models/swin2sr/test_modeling_swin2sr.py
@@ -13,7 +13,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Testing suite for the PyTorch Swin2SR model. """
-import inspect
 import unittest
 
 from transformers import Swin2SRConfig
@@ -162,7 +161,11 @@ def prepare_config_and_inputs_for_common(self):
 @require_torch
 class Swin2SRModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
     all_model_classes = (Swin2SRModel, Swin2SRForImageSuperResolution) if is_torch_available() else ()
-    pipeline_model_mapping = {"feature-extraction": Swin2SRModel} if is_torch_available() else {}
+    pipeline_model_mapping = (
+        {"feature-extraction": Swin2SRModel, "image-to-image": Swin2SRForImageSuperResolution}
+        if is_torch_available()
+        else {}
+    )
 
     fx_compatible = False
     test_pruning = False
@@ -228,18 +231,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     @slow
     def test_model_from_pretrained(self):
         for model_name in SWIN2SR_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
diff --git a/tests/models/swinv2/test_modeling_swinv2.py b/tests/models/swinv2/test_modeling_swinv2.py
index 5a771b2c4b6353..9b9d08b39fd1b9 100644
--- a/tests/models/swinv2/test_modeling_swinv2.py
+++ b/tests/models/swinv2/test_modeling_swinv2.py
@@ -14,7 +14,6 @@
 # limitations under the License.
 """ Testing suite for the PyTorch Swinv2 model. """
 import collections
-import inspect
 import unittest
 
 from transformers import Swinv2Config
@@ -220,18 +219,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_attention_outputs(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         config.return_dict = True
diff --git a/tests/models/switch_transformers/test_modeling_switch_transformers.py b/tests/models/switch_transformers/test_modeling_switch_transformers.py
index 5458b566667903..aa226f82ae3606 100644
--- a/tests/models/switch_transformers/test_modeling_switch_transformers.py
+++ b/tests/models/switch_transformers/test_modeling_switch_transformers.py
@@ -726,10 +726,6 @@ def test_generate_with_head_masking(self):
     def test_disk_offload(self):
         pass
 
-    @unittest.skip("Test does not fail individually but fails on the CI @ArthurZucker looking into it")
-    def test_assisted_decoding_sample(self):
-        pass
-
 
 class SwitchTransformersEncoderOnlyModelTester:
     def __init__(
diff --git a/tests/models/t5/test_modeling_t5.py b/tests/models/t5/test_modeling_t5.py
index fe098304735931..68b9f45e155b53 100644
--- a/tests/models/t5/test_modeling_t5.py
+++ b/tests/models/t5/test_modeling_t5.py
@@ -1036,10 +1036,6 @@ def test_model_fp16_forward(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs)
 
-    @unittest.skip("Test does not fail individually but fails on the CI @ArthurZucker looking into it")
-    def test_assisted_decoding_sample(self):
-        pass
-
 
 def use_task_specific_params(model, task):
     model.config.update(model.config.task_specific_params[task])
diff --git a/tests/models/timesformer/test_modeling_timesformer.py b/tests/models/timesformer/test_modeling_timesformer.py
index 2b7a5e279fe7ad..d4e71c8c599967 100644
--- a/tests/models/timesformer/test_modeling_timesformer.py
+++ b/tests/models/timesformer/test_modeling_timesformer.py
@@ -16,7 +16,6 @@
 
 
 import copy
-import inspect
 import unittest
 
 import numpy as np
@@ -204,18 +203,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py b/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py
deleted file mode 100644
index fdbff90b24b0ca..00000000000000
--- a/tests/models/transfo_xl/test_modeling_tf_transfo_xl.py
+++ /dev/null
@@ -1,282 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from __future__ import annotations
-
-import random
-import unittest
-
-from transformers import TransfoXLConfig, is_tf_available
-from transformers.testing_utils import require_tf, slow
-
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_tf_common import TFModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-    from transformers import (
-        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
-        TFTransfoXLForSequenceClassification,
-        TFTransfoXLLMHeadModel,
-        TFTransfoXLModel,
-    )
-
-
-class TFTransfoXLModelTester:
-    def __init__(
-        self,
-        parent,
-    ):
-        self.parent = parent
-        self.batch_size = 13
-        self.seq_length = 7
-        self.mem_len = 30
-        self.key_length = self.seq_length + self.mem_len
-        self.clamp_len = 15
-        self.is_training = True
-        self.use_labels = True
-        self.vocab_size = 99
-        self.cutoffs = [10, 50, 80]
-        self.hidden_size = 32
-        self.d_embed = 32
-        self.num_attention_heads = 4
-        self.d_head = 8
-        self.d_inner = 128
-        self.div_val = 2
-        self.num_hidden_layers = 2
-        self.scope = None
-        self.seed = 1
-        self.eos_token_id = 0
-        self.num_labels = 3
-        self.pad_token_id = self.vocab_size - 1
-        self.init_range = 0.01
-
-    def prepare_config_and_inputs(self):
-        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        lm_labels = None
-        if self.use_labels:
-            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = TransfoXLConfig(
-            vocab_size=self.vocab_size,
-            mem_len=self.mem_len,
-            clamp_len=self.clamp_len,
-            cutoffs=self.cutoffs,
-            d_model=self.hidden_size,
-            d_embed=self.d_embed,
-            n_head=self.num_attention_heads,
-            d_head=self.d_head,
-            d_inner=self.d_inner,
-            div_val=self.div_val,
-            n_layer=self.num_hidden_layers,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.vocab_size - 1,
-            init_range=self.init_range,
-            num_labels=self.num_labels,
-        )
-
-        return (config, input_ids_1, input_ids_2, lm_labels)
-
-    def set_seed(self):
-        random.seed(self.seed)
-        tf.random.set_seed(self.seed)
-
-    def create_and_check_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
-        model = TFTransfoXLModel(config)
-
-        hidden_states_1, mems_1 = model(input_ids_1).to_tuple()
-
-        inputs = {"input_ids": input_ids_2, "mems": mems_1}
-
-        hidden_states_2, mems_2 = model(inputs).to_tuple()
-
-        self.parent.assertEqual(hidden_states_1.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(hidden_states_2.shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in mems_1],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-        self.parent.assertListEqual(
-            [mem.shape for mem in mems_2],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
-        model = TFTransfoXLLMHeadModel(config)
-
-        lm_logits_1, mems_1 = model(input_ids_1).to_tuple()
-
-        inputs = {"input_ids": input_ids_1, "labels": lm_labels}
-        _, mems_1 = model(inputs).to_tuple()
-
-        lm_logits_2, mems_2 = model([input_ids_2, mems_1]).to_tuple()
-
-        inputs = {"input_ids": input_ids_1, "mems": mems_1, "labels": lm_labels}
-
-        _, mems_2 = model(inputs).to_tuple()
-
-        self.parent.assertEqual(lm_logits_1.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in mems_1],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-        self.parent.assertEqual(lm_logits_2.shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in mems_2],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_and_check_transfo_xl_for_sequence_classification(self, config, input_ids_1, input_ids_2, lm_labels):
-        model = TFTransfoXLForSequenceClassification(config)
-        result = model(input_ids_1)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids_1}
-        return config, inputs_dict
-
-
-@require_tf
-class TFTransfoXLModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TFTransfoXLModel, TFTransfoXLLMHeadModel, TFTransfoXLForSequenceClassification) if is_tf_available() else ()
-    )
-    all_generative_model_classes = () if is_tf_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TFTransfoXLModel,
-            "text-classification": TFTransfoXLForSequenceClassification,
-            "text-generation": TFTransfoXLLMHeadModel,
-            "zero-shot": TFTransfoXLForSequenceClassification,
-        }
-        if is_tf_available()
-        else {}
-    )
-    # TODO: add this test when TFTransfoXLLMHead has a linear output layer implemented
-    test_resize_embeddings = False
-    test_head_masking = False
-    test_onnx = False
-    test_mismatched_shapes = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "TextGenerationPipelineTests":
-            # Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
-            # `TransfoXLConfig` was never used in pipeline tests: cannot create a simple
-            # tokenizer.
-            return True
-
-        return False
-
-    def setUp(self):
-        self.model_tester = TFTransfoXLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_transfo_xl_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_transfo_xl_model(*config_and_inputs)
-
-    def test_transfo_xl_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_transfo_xl_lm_head(*config_and_inputs)
-
-    def test_transfo_xl_sequence_classification_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_transfo_xl_for_sequence_classification(*config_and_inputs)
-
-    def test_model_common_attributes(self):
-        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-        list_other_models_with_output_ebd = [TFTransfoXLForSequenceClassification]
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            assert isinstance(model.get_input_embeddings(), tf.keras.layers.Layer)
-            if model_class in list_other_models_with_output_ebd:
-                x = model.get_output_embeddings()
-                assert isinstance(x, tf.keras.layers.Layer)
-                name = model.get_bias()
-                assert name is None
-            else:
-                x = model.get_output_embeddings()
-                assert x is None
-                name = model.get_bias()
-                assert name is None
-
-    def test_xla_mode(self):
-        # TODO JP: Make TransfoXL XLA compliant
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            model = TFTransfoXLModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-    @unittest.skip(reason="This model doesn't play well with fit() due to not returning a single loss.")
-    def test_dataset_conversion(self):
-        pass
-
-
-@require_tf
-class TFTransfoXLModelLanguageGenerationTest(unittest.TestCase):
-    @unittest.skip("Skip test until #12651 is resolved.")
-    @slow
-    def test_lm_generate_transfo_xl_wt103(self):
-        model = TFTransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")
-        input_ids = tf.convert_to_tensor([[33, 1297, 2, 1, 1009, 4, 1109, 11739, 4762, 358, 5, 25, 245, 22, 1706, 17, 20098, 5, 3215, 21, 37, 1110, 3, 13, 1041, 4, 24, 603, 490, 2, 71477, 20098, 104447, 2, 20961, 1, 2604, 4, 1, 329, 3, 6224, 831, 16002, 2, 8, 603, 78967, 29546, 23, 803, 20, 25, 416, 5, 8, 232, 4, 277, 6, 1855, 4601, 3, 29546, 54, 8, 3609, 5, 57211, 49, 4, 1, 277, 18, 8, 1755, 15691, 3, 341, 25, 416, 693, 42573, 71, 17, 401, 94, 31, 17919, 2, 29546, 7873, 18, 1, 435, 23, 11011, 755, 5, 5167, 3, 7983, 98, 84, 2, 29546, 3267, 8, 3609, 4, 1, 4865, 1075, 2, 6087, 71, 6, 346, 8, 5854, 3, 29546, 824, 1400, 1868, 2, 19, 160, 2, 311, 8, 5496, 2, 20920, 17, 25, 15097, 3, 24, 24, 0]])  # fmt: skip
-        #  In 1991 , the remains of Russian Tsar Nicholas II and his family
-        #  ( except for Alexei and Maria ) are discovered .
-        #  The voice of Nicholas's young son , Tsarevich Alexei Nikolaevich , narrates the
-        #  remainder of the story . 1883 Western Siberia ,
-        #  a young Grigori Rasputin is asked by his father and a group of men to perform magic .
-        #  Rasputin has a vision and denounces one of the men as a horse thief . Although his
-        #  father initially slaps him for making such an accusation , Rasputin watches as the
-        #  man is chased outside and beaten . Twenty years later , Rasputin sees a vision of
-        #  the Virgin Mary , prompting him to become a priest . Rasputin quickly becomes famous ,
-        #  with people , even a bishop , begging for his blessing . <eod> </s> <eos>
-
-        expected_output_ids = [33,1297,2,1,1009,4,1109,11739,4762,358,5,25,245,22,1706,17,20098,5,3215,21,37,1110,3,13,1041,4,24,603,490,2,71477,20098,104447,2,20961,1,2604,4,1,329,3,6224,831,16002,2,8,603,78967,29546,23,803,20,25,416,5,8,232,4,277,6,1855,4601,3,29546,54,8,3609,5,57211,49,4,1,277,18,8,1755,15691,3,341,25,416,693,42573,71,17,401,94,31,17919,2,29546,7873,18,1,435,23,11011,755,5,5167,3,7983,98,84,2,29546,3267,8,3609,4,1,4865,1075,2,6087,71,6,346,8,5854,3,29546,824,1400,1868,2,19,160,2,311,8,5496,2,20920,17,25,15097,3,24,24,0,33,1,142,1298,188,2,29546,113,8,3654,4,1,1109,7136,833,3,13,1645,4,29546,11,104,7,1,1109,532,7129,2,10,83507,2,1162,1123,2,6,7245,10,2,5,11,104,7,1,1109,532,7129,2,10,24,24,10,22,10,13,770,5863,4,7245,10]  # fmt: skip
-        #  In 1991, the remains of Russian Tsar Nicholas II and his family ( except for
-        #  Alexei and Maria ) are discovered. The voice of young son, Tsarevich Alexei
-        #  Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young
-        #  Grigori Rasputin is asked by his father and a group of men to perform magic.
-        #  Rasputin has a vision and denounces one of the men as a horse thief. Although
-        #  his father initially slaps him for making such an accusation, Rasputin watches
-        #  as the man is chased outside and beaten. Twenty years later, Rasputin sees a
-        #  vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly
-        #  becomes famous, with people, even a bishop, begging for his blessing. In the
-        #  early 20th century, Rasputin became a symbol of the Russian Orthodox Church.
-        #  The image of Rasputin was used in the Russian national anthem, " Nearer, My God,
-        #  to Heaven ", and was used in the Russian national anthem, " " ( " The Great Spirit
-        #  of Heaven "
-
-        output_ids = model.generate(input_ids, max_length=200, do_sample=False)
-        self.assertListEqual(output_ids[0].numpy().tolist(), expected_output_ids)
diff --git a/tests/models/transfo_xl/test_modeling_transfo_xl.py b/tests/models/transfo_xl/test_modeling_transfo_xl.py
deleted file mode 100644
index 9534b13c852629..00000000000000
--- a/tests/models/transfo_xl/test_modeling_transfo_xl.py
+++ /dev/null
@@ -1,533 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import copy
-import random
-import unittest
-
-from transformers import TransfoXLConfig, is_torch_available
-from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device
-
-from ...generation.test_utils import GenerationTesterMixin
-from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
-from ...test_pipeline_mixin import PipelineTesterMixin
-
-
-if is_torch_available():
-    import torch
-    from torch import nn
-
-    from transformers import TransfoXLForSequenceClassification, TransfoXLLMHeadModel, TransfoXLModel
-    from transformers.models.transfo_xl.modeling_transfo_xl import TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST
-
-
-class TransfoXLModelTester:
-    def __init__(
-        self,
-        parent,
-        batch_size=14,
-        seq_length=7,
-        mem_len=30,
-        clamp_len=15,
-        is_training=False,
-        use_labels=True,
-        vocab_size=99,
-        cutoffs=[10, 50, 80],
-        hidden_size=32,
-        d_embed=32,
-        num_attention_heads=4,
-        d_head=8,
-        d_inner=128,
-        div_val=2,
-        num_hidden_layers=2,
-        scope=None,
-        seed=1,
-        eos_token_id=0,
-        num_labels=3,
-    ):
-        self.parent = parent
-        self.batch_size = batch_size
-        self.seq_length = seq_length
-        self.mem_len = mem_len
-        self.key_length = self.seq_length + self.mem_len
-        self.clamp_len = clamp_len
-        self.is_training = is_training
-        self.use_labels = use_labels
-        self.vocab_size = vocab_size
-        self.cutoffs = cutoffs
-        self.hidden_size = hidden_size
-        self.d_embed = d_embed
-        self.num_attention_heads = num_attention_heads
-        self.d_head = d_head
-        self.d_inner = d_inner
-        self.div_val = div_val
-        self.num_hidden_layers = num_hidden_layers
-        self.scope = scope
-        self.seed = seed
-        self.eos_token_id = eos_token_id
-        self.num_labels = num_labels
-        self.pad_token_id = self.vocab_size - 1
-
-    def prepare_config_and_inputs(self):
-        input_ids_1 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        input_ids_2 = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        lm_labels = None
-        if self.use_labels:
-            lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-
-        config = self.get_config()
-
-        return (config, input_ids_1, input_ids_2, lm_labels)
-
-    def get_config(self):
-        return TransfoXLConfig(
-            vocab_size=self.vocab_size,
-            mem_len=self.mem_len,
-            clamp_len=self.clamp_len,
-            cutoffs=self.cutoffs,
-            d_model=self.hidden_size,
-            d_embed=self.d_embed,
-            n_head=self.num_attention_heads,
-            d_head=self.d_head,
-            d_inner=self.d_inner,
-            div_val=self.div_val,
-            n_layer=self.num_hidden_layers,
-            eos_token_id=self.eos_token_id,
-            pad_token_id=self.pad_token_id,
-        )
-
-    def set_seed(self):
-        random.seed(self.seed)
-        torch.manual_seed(self.seed)
-
-    def create_transfo_xl_model(self, config, input_ids_1, input_ids_2, lm_labels):
-        model = TransfoXLModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        outputs1 = model(input_ids_1)
-        outputs2 = model(input_ids_2, outputs1["mems"])
-        outputs = {
-            "hidden_states_1": outputs1["last_hidden_state"],
-            "mems_1": outputs1["mems"],
-            "hidden_states_2": outputs2["last_hidden_state"],
-            "mems_2": outputs2["mems"],
-        }
-        return outputs
-
-    def check_transfo_xl_model_output(self, result):
-        self.parent.assertEqual(result["hidden_states_1"].shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertEqual(result["hidden_states_2"].shape, (self.batch_size, self.seq_length, self.hidden_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result["mems_1"]],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-        self.parent.assertListEqual(
-            [mem.shape for mem in result["mems_2"]],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_transfo_xl_lm_head(self, config, input_ids_1, input_ids_2, lm_labels):
-        model = TransfoXLLMHeadModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        lm_logits_1 = model(input_ids_1)["prediction_scores"]
-        outputs1 = model(input_ids_1, labels=lm_labels)
-        lm_logits_2 = model(input_ids_2, mems=outputs1["mems"])["prediction_scores"]
-        outputs2 = model(input_ids_2, labels=lm_labels, mems=outputs1["mems"])
-
-        outputs = {
-            "loss_1": outputs1["loss"],
-            "losses_1": outputs1["losses"],
-            "mems_1": outputs1["mems"],
-            "lm_logits_1": lm_logits_1,
-            "loss_2": outputs2["loss"],
-            "losses_2": outputs2["losses"],
-            "mems_2": outputs2["mems"],
-            "lm_logits_2": lm_logits_2,
-        }
-        return outputs
-
-    def check_transfo_xl_lm_head_output(self, result):
-        self.parent.assertEqual(result["loss_1"].shape, ())
-        self.parent.assertEqual(result["losses_1"].shape, (self.batch_size, self.seq_length - 1))
-        self.parent.assertEqual(result["lm_logits_1"].shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result["mems_1"]],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-        self.parent.assertEqual(result["loss_2"].shape, ())
-        self.parent.assertEqual(result["losses_2"].shape, (self.batch_size, self.seq_length - 1))
-        self.parent.assertEqual(result["lm_logits_2"].shape, (self.batch_size, self.seq_length, self.vocab_size))
-        self.parent.assertListEqual(
-            [mem.shape for mem in result["mems_2"]],
-            [(self.mem_len, self.batch_size, self.hidden_size)] * self.num_hidden_layers,
-        )
-
-    def create_transfo_xl_lm_head_trainer_compatible_tuple(self, config, input_ids_1, input_ids_2, lm_labels):
-        config.trainer_compatible = True
-        model = TransfoXLLMHeadModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        lm_logits_1 = model(input_ids_1, return_dict=False)[0]
-        outputs1 = model(input_ids_1, labels=lm_labels, return_dict=False)
-        loss_1, _, losses_1, mems_1 = outputs1[:4]
-        lm_logits_2 = model(input_ids_2, mems=mems_1, return_dict=False)[0]
-        outputs2 = model(input_ids_2, labels=lm_labels, mems=mems_1, return_dict=False)
-        loss_2, _, losses_2, mems_2 = outputs2[:4]
-
-        outputs = {
-            "losses_1": losses_1,
-            "mems_1": mems_1,
-            "lm_logits_1": lm_logits_1,
-            "loss_1": loss_1,
-            "losses_2": losses_2,
-            "mems_2": mems_2,
-            "lm_logits_2": lm_logits_2,
-            "loss_2": loss_2,
-        }
-
-        config.trainer_compatible = None
-        return outputs
-
-    def create_transfo_xl_lm_head_trainer_incompatible_tuple(self, config, input_ids_1, input_ids_2, lm_labels):
-        config.trainer_compatible = False
-        model = TransfoXLLMHeadModel(config)
-        model.to(torch_device)
-        model.eval()
-
-        lm_logits_1 = model(input_ids_1, return_dict=False)[0]
-        outputs1 = model(input_ids_1, labels=lm_labels, return_dict=False)
-        losses_1, _, mems_1 = outputs1[:3]
-        loss_1 = outputs1[-1]
-        lm_logits_2 = model(input_ids_2, mems=mems_1, return_dict=False)[0]
-        outputs2 = model(input_ids_2, labels=lm_labels, mems=mems_1)
-        losses_2, _, mems_2 = outputs2[:3]
-        loss_2 = outputs2[-1]
-
-        outputs = {
-            "losses_1": losses_1,
-            "mems_1": mems_1,
-            "lm_logits_1": lm_logits_1,
-            "loss_1": loss_1,
-            "losses_2": losses_2,
-            "mems_2": mems_2,
-            "lm_logits_2": lm_logits_2,
-            "loss_2": loss_2,
-        }
-
-        config.trainer_compatible = None
-        return outputs
-
-    def create_and_check_transfo_xl_for_sequence_classification(self, config, input_ids_1, input_ids_2, lm_labels):
-        config.num_labels = self.num_labels
-        model = TransfoXLForSequenceClassification(config)
-        model.to(torch_device)
-        model.eval()
-        result = model(input_ids_1)
-        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
-
-    def prepare_config_and_inputs_for_common(self):
-        config_and_inputs = self.prepare_config_and_inputs()
-        (config, input_ids_1, input_ids_2, lm_labels) = config_and_inputs
-        inputs_dict = {"input_ids": input_ids_1}
-        return config, inputs_dict
-
-
-@require_torch
-class TransfoXLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
-    all_model_classes = (
-        (TransfoXLModel, TransfoXLLMHeadModel, TransfoXLForSequenceClassification) if is_torch_available() else ()
-    )
-    all_generative_model_classes = (TransfoXLLMHeadModel,) if is_torch_available() else ()
-    pipeline_model_mapping = (
-        {
-            "feature-extraction": TransfoXLModel,
-            "text-classification": TransfoXLForSequenceClassification,
-            "text-generation": TransfoXLLMHeadModel,
-            "zero-shot": TransfoXLForSequenceClassification,
-        }
-        if is_torch_available()
-        else {}
-    )
-    test_pruning = False
-    test_resize_embeddings = True
-    test_mismatched_shapes = False
-
-    # TODO: Fix the failed tests
-    def is_pipeline_test_to_skip(
-        self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
-    ):
-        if pipeline_test_casse_name == "TextGenerationPipelineTests":
-            # Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
-            # `TransfoXLConfig` was never used in pipeline tests: cannot create a simple
-            # tokenizer.
-            return True
-
-        return False
-
-    def check_cutoffs_and_n_token(
-        self, copied_cutoffs, layer, model_embed, model, model_class, resized_value, vocab_size
-    ):
-        # Check that the cutoffs were modified accordingly
-        for i in range(len(copied_cutoffs)):
-            if i < layer:
-                self.assertEqual(model_embed.cutoffs[i], copied_cutoffs[i])
-                if model_class == TransfoXLLMHeadModel:
-                    self.assertEqual(model.crit.cutoffs[i], copied_cutoffs[i])
-                if i < len(model.config.cutoffs):
-                    self.assertEqual(model.config.cutoffs[i], copied_cutoffs[i])
-            else:
-                self.assertEqual(model_embed.cutoffs[i], copied_cutoffs[i] + resized_value)
-                if model_class == TransfoXLLMHeadModel:
-                    self.assertEqual(model.crit.cutoffs[i], copied_cutoffs[i] + resized_value)
-                if i < len(model.config.cutoffs):
-                    self.assertEqual(model.config.cutoffs[i], copied_cutoffs[i] + resized_value)
-
-        self.assertEqual(model_embed.n_token, vocab_size + resized_value)
-        if model_class == TransfoXLLMHeadModel:
-            self.assertEqual(model.crit.n_token, vocab_size + resized_value)
-
-    def setUp(self):
-        self.model_tester = TransfoXLModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=TransfoXLConfig, d_embed=37)
-
-    def test_config(self):
-        self.config_tester.run_common_tests()
-
-    def test_transfo_xl_model(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        output_result = self.model_tester.create_transfo_xl_model(*config_and_inputs)
-        self.model_tester.check_transfo_xl_model_output(output_result)
-
-    def test_transfo_xl_lm_head(self):
-        self.model_tester.set_seed()
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-
-        output_result = self.model_tester.create_transfo_xl_lm_head(*config_and_inputs)
-        self.model_tester.check_transfo_xl_lm_head_output(output_result)
-
-        output_result = self.model_tester.create_transfo_xl_lm_head_trainer_compatible_tuple(*config_and_inputs)
-        self.model_tester.check_transfo_xl_lm_head_output(output_result)
-
-        output_result = self.model_tester.create_transfo_xl_lm_head_trainer_incompatible_tuple(*config_and_inputs)
-        self.model_tester.check_transfo_xl_lm_head_output(output_result)
-
-    def test_transfo_xl_sequence_classification_model(self):
-        config_and_inputs = self.model_tester.prepare_config_and_inputs()
-        self.model_tester.create_and_check_transfo_xl_for_sequence_classification(*config_and_inputs)
-
-    def test_retain_grad_hidden_states_attentions(self):
-        # xlnet cannot keep gradients in attentions or hidden states
-        return
-
-    @require_torch_multi_gpu
-    def test_multi_gpu_data_parallel_forward(self):
-        # Opt-out of this test.
-        pass
-
-    @slow
-    def test_model_from_pretrained(self):
-        for model_name in TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
-            model = TransfoXLModel.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-
-    def test_resize_tokens_embeddings(self):
-        (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
-        if not self.test_resize_embeddings:
-            return
-
-        for model_class in self.all_model_classes:
-            config = copy.deepcopy(original_config)
-            model = model_class(config)
-            model.to(torch_device)
-
-            if self.model_tester.is_training is False:
-                model.eval()
-
-            model_vocab_size = config.vocab_size
-            # Retrieve the embeddings and clone theme
-            model_embed = model.resize_token_embeddings(model_vocab_size)
-            cloned_embeddings = [emb.weight.clone() for emb in model_embed.emb_layers]
-            # Retrieve the cutoffs and copy them
-            copied_cutoffs = copy.copy(model_embed.cutoffs)
-
-            test_layers = list(range(config.div_val))
-            for layer in test_layers:
-                # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
-                model_embed = model.resize_token_embeddings(model_vocab_size + 10, layer)
-                self.assertEqual(model.config.vocab_size, model_vocab_size + 10)
-                # Check that it actually resizes the embeddings matrix
-                self.assertEqual(model_embed.emb_layers[layer].weight.shape[0], cloned_embeddings[layer].shape[0] + 10)
-                # Check that the cutoffs were modified accordingly
-                self.check_cutoffs_and_n_token(
-                    copied_cutoffs, layer, model_embed, model, model_class, 10, model_vocab_size
-                )
-
-                # Check that the model can still do a forward pass successfully (every parameter should be resized)
-                model(**inputs_dict)
-
-                # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
-                model_embed = model.resize_token_embeddings(model_vocab_size - 5, layer)
-                self.assertEqual(model.config.vocab_size, model_vocab_size - 5)
-                # Check that it actually resizes the embeddings matrix
-                self.assertEqual(model_embed.emb_layers[layer].weight.shape[0], cloned_embeddings[layer].shape[0] - 5)
-                # Check that the cutoffs were modified accordingly
-                self.check_cutoffs_and_n_token(
-                    copied_cutoffs, layer, model_embed, model, model_class, -5, model_vocab_size
-                )
-
-                # Check that the model can still do a forward pass successfully (every parameter should be resized)
-                # Input ids should be clamped to the maximum size of the vocabulary
-                inputs_dict["input_ids"].clamp_(max=model_vocab_size - 5 - 1)
-                model(**inputs_dict)
-
-                # Check that adding and removing tokens has not modified the first part of the embedding matrix.
-                models_equal = True
-                for p1, p2 in zip(cloned_embeddings[layer], model_embed.emb_layers[layer].weight):
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
-
-                self.assertTrue(models_equal)
-
-                # Reset model embeddings to original size
-                model.resize_token_embeddings(model_vocab_size, layer)
-                self.assertEqual(model_vocab_size, model.config.vocab_size)
-                self.assertEqual(model_embed.emb_layers[layer].weight.shape[0], cloned_embeddings[layer].shape[0])
-
-    def test_resize_embeddings_untied(self):
-        # transfo-xl requires special resize for lm-head
-        return
-
-    def _check_attentions_for_generate(
-        self, batch_size, attentions, min_length, max_length, config, use_cache=False, num_beam_groups=1
-    ):
-        self.assertIsInstance(attentions, tuple)
-        self.assertListEqual(
-            [isinstance(iter_attentions, tuple) for iter_attentions in attentions], [True] * len(attentions)
-        )
-        self.assertEqual(len(attentions), (max_length - min_length) * num_beam_groups)
-
-        for idx, iter_attentions in enumerate(attentions):
-            tgt_len = min_length if idx == 0 else (min_length - 2)
-            src_len = (min_length + config.mem_len) if idx == 0 else (min_length + config.mem_len - 2)
-
-            expected_shape = (
-                batch_size * num_beam_groups,
-                config.num_attention_heads,
-                tgt_len,
-                src_len,
-            )
-
-            # check attn size
-            self.assertListEqual(
-                [layer_attention.shape for layer_attention in iter_attentions], [expected_shape] * len(iter_attentions)
-            )
-
-    def _check_hidden_states_for_generate(
-        self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1
-    ):
-        self.assertIsInstance(hidden_states, tuple)
-        self.assertListEqual(
-            [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states],
-            [True] * len(hidden_states),
-        )
-        self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups)
-
-        for idx, iter_hidden_states in enumerate(hidden_states):
-            seq_len = min_length if idx == 0 else min_length - 2
-            expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size)
-            # check hidden size
-            self.assertListEqual(
-                [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states],
-                [expected_shape] * len(iter_hidden_states),
-            )
-
-    # overwrite from test_modeling_common
-    def _mock_init_weights(self, module):
-        if hasattr(module, "weight") and module.weight is not None:
-            module.weight.data.fill_(3)
-        if hasattr(module, "cluster_weight") and module.cluster_weight is not None:
-            module.cluster_weight.data.fill_(3)
-        if hasattr(module, "bias") and module.bias is not None:
-            module.bias.data.fill_(3)
-        if hasattr(module, "cluster_bias") and module.cluster_bias is not None:
-            module.cluster_bias.data.fill_(3)
-
-        if hasattr(module, "emb_projs"):
-            for i in range(len(module.emb_projs)):
-                if module.emb_projs[i] is not None:
-                    nn.init.constant_(module.emb_projs[i], 0.0003)
-        if hasattr(module, "out_projs"):
-            for i in range(len(module.out_projs)):
-                if module.out_projs[i] is not None:
-                    nn.init.constant_(module.out_projs[i], 0.0003)
-
-        for param in ["r_emb", "r_w_bias", "r_r_bias", "r_bias"]:
-            if hasattr(module, param) and getattr(module, param) is not None:
-                weight = getattr(module, param)
-                weight.data.fill_(3)
-
-    @unittest.skip("The model doesn't support left padding")  # and it's not used enough to be worth fixing :)
-    def test_left_padding_compatibility(self):
-        pass
-
-    @unittest.skip("This test is currently broken because of safetensors.")
-    def test_tf_from_pt_safetensors(self):
-        pass
-
-
-@require_torch
-class TransfoXLModelLanguageGenerationTest(unittest.TestCase):
-    @slow
-    def test_lm_generate_transfo_xl_wt103(self):
-        model = TransfoXLLMHeadModel.from_pretrained("transfo-xl-wt103")
-        model.to(torch_device)
-
-        input_ids = torch.tensor([[33, 1297, 2, 1, 1009, 4, 1109, 11739, 4762, 358, 5, 25, 245, 22, 1706, 17, 20098, 5, 3215, 21, 37, 1110, 3, 13, 1041, 4, 24, 603, 490, 2, 71477, 20098, 104447, 2, 20961, 1, 2604, 4, 1, 329, 3, 6224, 831, 16002, 2, 8, 603, 78967, 29546, 23, 803, 20, 25, 416, 5, 8, 232, 4, 277, 6, 1855, 4601, 3, 29546, 54, 8, 3609, 5, 57211, 49, 4, 1, 277, 18, 8, 1755, 15691, 3, 341, 25, 416, 693, 42573, 71, 17, 401, 94, 31, 17919, 2, 29546, 7873, 18, 1, 435, 23, 11011, 755, 5, 5167, 3, 7983, 98, 84, 2, 29546, 3267, 8, 3609, 4, 1, 4865, 1075, 2, 6087, 71, 6, 346, 8, 5854, 3, 29546, 824, 1400, 1868, 2, 19, 160, 2, 311, 8, 5496, 2, 20920, 17, 25, 15097, 3, 24, 24, 0]], dtype=torch.long,device=torch_device)  # fmt: skip
-        #  In 1991 , the remains of Russian Tsar Nicholas II and his family
-        #  ( except for Alexei and Maria ) are discovered .
-        #  The voice of Nicholas's young son , Tsarevich Alexei Nikolaevich , narrates the
-        #  remainder of the story . 1883 Western Siberia ,
-        #  a young Grigori Rasputin is asked by his father and a group of men to perform magic .
-        #  Rasputin has a vision and denounces one of the men as a horse thief . Although his
-        #  father initially slaps him for making such an accusation , Rasputin watches as the
-        #  man is chased outside and beaten . Twenty years later , Rasputin sees a vision of
-        #  the Virgin Mary , prompting him to become a priest . Rasputin quickly becomes famous ,
-        #  with people , even a bishop , begging for his blessing . <eod> </s> <eos>
-
-        expected_output_ids = [33,1297,2,1,1009,4,1109,11739,4762,358,5,25,245,22,1706,17,20098,5,3215,21,37,1110,3,13,1041,4,24,603,490,2,71477,20098,104447,2,20961,1,2604,4,1,329,3,6224,831,16002,2,8,603,78967,29546,23,803,20,25,416,5,8,232,4,277,6,1855,4601,3,29546,54,8,3609,5,57211,49,4,1,277,18,8,1755,15691,3,341,25,416,693,42573,71,17,401,94,31,17919,2,29546,7873,18,1,435,23,11011,755,5,5167,3,7983,98,84,2,29546,3267,8,3609,4,1,4865,1075,2,6087,71,6,346,8,5854,3,29546,824,1400,1868,2,19,160,2,311,8,5496,2,20920,17,25,15097,3,24,24,0,33,1,142,1298,188,2,29546,113,8,3654,4,1,1109,7136,833,3,13,1645,4,29546,11,104,7,1,1109,532,7129,2,10,83507,2,1162,1123,2,6,7245,10,2,5,11,104,7,1,1109,532,7129,2,10,24,24,10,22,10,13,770,5863,4,7245,10]  # fmt: skip
-        #  In 1991, the remains of Russian Tsar Nicholas II and his family ( except for
-        #  Alexei and Maria ) are discovered. The voice of young son, Tsarevich Alexei
-        #  Nikolaevich, narrates the remainder of the story. 1883 Western Siberia, a young
-        #  Grigori Rasputin is asked by his father and a group of men to perform magic.
-        #  Rasputin has a vision and denounces one of the men as a horse thief. Although
-        #  his father initially slaps him for making such an accusation, Rasputin watches
-        #  as the man is chased outside and beaten. Twenty years later, Rasputin sees a
-        #  vision of the Virgin Mary, prompting him to become a priest. Rasputin quickly
-        #  becomes famous, with people, even a bishop, begging for his blessing. In the
-        #  early 20th century, Rasputin became a symbol of the Russian Orthodox Church.
-        #  The image of Rasputin was used in the Russian national anthem, " Nearer, My God,
-        #  to Heaven ", and was used in the Russian national anthem, " " ( " The Great Spirit
-        #  of Heaven "
-
-        output_ids = model.generate(input_ids, max_length=200, do_sample=False)
-        self.assertListEqual(output_ids[0].tolist(), expected_output_ids)
diff --git a/tests/models/transfo_xl/test_tokenization_transfo_xl.py b/tests/models/transfo_xl/test_tokenization_transfo_xl.py
deleted file mode 100644
index 15b712ff3784e3..00000000000000
--- a/tests/models/transfo_xl/test_tokenization_transfo_xl.py
+++ /dev/null
@@ -1,130 +0,0 @@
-# coding=utf-8
-# Copyright 2020 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import os
-import unittest
-
-from transformers.models.transfo_xl.tokenization_transfo_xl import VOCAB_FILES_NAMES, TransfoXLTokenizer
-
-from ...test_tokenization_common import TokenizerTesterMixin
-
-
-class TransfoXLTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
-    tokenizer_class = TransfoXLTokenizer
-    test_rust_tokenizer = False
-    test_seq2seq = False
-
-    def setUp(self):
-        super().setUp()
-
-        vocab_tokens = [
-            "<unk>",
-            "[CLS]",
-            "[SEP]",
-            "want",
-            "unwanted",
-            "wa",
-            "un",
-            "running",
-            ",",
-            "low",
-            "l",
-        ]
-        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
-        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
-            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
-
-    def get_tokenizer(self, **kwargs):
-        kwargs["lower_case"] = True
-        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, **kwargs)
-
-    def get_input_output_texts(self, tokenizer):
-        input_text = "<unk> UNwanted , running"
-        output_text = "<unk> unwanted, running"
-        return input_text, output_text
-
-    def test_full_tokenizer(self):
-        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
-
-        tokens = tokenizer.tokenize("<unk> UNwanted , running")
-        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
-
-        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
-
-    def test_full_tokenizer_lower(self):
-        tokenizer = TransfoXLTokenizer(lower_case=True)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo ! how  \n Are yoU ?  "), ["hello", "!", "how", "are", "you", "?"]
-        )
-
-    def test_full_tokenizer_no_lower(self):
-        tokenizer = TransfoXLTokenizer(lower_case=False)
-
-        self.assertListEqual(
-            tokenizer.tokenize(" \tHeLLo ! how  \n Are yoU ?  "), ["HeLLo", "!", "how", "Are", "yoU", "?"]
-        )
-
-    def test_full_tokenizer_moses_numbers(self):
-        tokenizer = TransfoXLTokenizer(lower_case=False)
-        text_in = "Hello (bracket) and side-scrolled [and] Henry's $5,000 with 3.34 m. What's up!?"
-        tokens_out = [
-            "Hello",
-            "(",
-            "bracket",
-            ")",
-            "and",
-            "side",
-            "@-@",
-            "scrolled",
-            "[",
-            "and",
-            "]",
-            "Henry",
-            "'s",
-            "$",
-            "5",
-            "@,@",
-            "000",
-            "with",
-            "3",
-            "@.@",
-            "34",
-            "m",
-            ".",
-            "What",
-            "'s",
-            "up",
-            "!",
-            "?",
-        ]
-
-        self.assertListEqual(tokenizer.tokenize(text_in), tokens_out)
-
-        self.assertEqual(tokenizer.convert_tokens_to_string(tokens_out), text_in)
-
-    def test_move_added_token(self):
-        tokenizer = self.get_tokenizer()
-        original_len = len(tokenizer)
-
-        tokenizer.add_tokens(["new1", "new2"])
-        tokenizer.move_added_token("new1", 1)
-
-        # Check that moved token is not copied (duplicate)
-        self.assertEqual(len(tokenizer), original_len + 2)
-        # Check that token is moved to specified id
-        self.assertEqual(tokenizer.encode("new1"), [1])
-        self.assertEqual(tokenizer.decode([1]), "new1")
diff --git a/tests/models/tvp/__init__.py b/tests/models/tvp/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/tvp/test_image_processing_tvp.py b/tests/models/tvp/test_image_processing_tvp.py
new file mode 100644
index 00000000000000..1c9a84beb8427b
--- /dev/null
+++ b/tests/models/tvp/test_image_processing_tvp.py
@@ -0,0 +1,306 @@
+# coding=utf-8
+# Copyright 2023 The Intel Team Authors, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+
+from transformers.image_transforms import PaddingMode
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin, prepare_video_inputs
+
+
+if is_torch_available():
+    import torch
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import TvpImageProcessor
+
+
+class TvpImageProcessingTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        do_resize: bool = True,
+        size: Dict[str, int] = {"longest_edge": 40},
+        do_center_crop: bool = False,
+        crop_size: Dict[str, int] = None,
+        do_rescale: bool = False,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_pad: bool = True,
+        pad_size: Dict[str, int] = {"height": 80, "width": 80},
+        fill: int = None,
+        pad_mode: PaddingMode = None,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, List[float]]] = [0.48145466, 0.4578275, 0.40821073],
+        image_std: Optional[Union[float, List[float]]] = [0.26862954, 0.26130258, 0.27577711],
+        batch_size=2,
+        min_resolution=40,
+        max_resolution=80,
+        num_channels=3,
+        num_frames=2,
+    ):
+        self.do_resize = do_resize
+        self.size = size
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+        self.fill = fill
+        self.pad_mode = pad_mode
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.num_frames = num_frames
+
+    def prepare_image_processor_dict(self):
+        return {
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_normalize": self.do_normalize,
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_rescale": self.do_rescale,
+            "do_center_crop": self.do_center_crop,
+            "do_pad": self.do_pad,
+            "pad_size": self.pad_size,
+        }
+
+    def get_expected_values(self, image_inputs, batched=False):
+        """
+        This function computes the expected height and width when providing images to TvpImageProcessor,
+        assuming do_resize is set to True with a scalar size.
+        """
+        if not batched:
+            return (int(self.pad_size["height"]), int(self.pad_size["width"]))
+
+        else:
+            expected_values = []
+            for image in image_inputs:
+                expected_height, expected_width = self.get_expected_values([image])
+                expected_values.append((expected_height, expected_width))
+            expected_height = max(expected_values, key=lambda item: item[0])[0]
+            expected_width = max(expected_values, key=lambda item: item[1])[1]
+
+        return expected_height, expected_width
+
+    def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
+        return prepare_video_inputs(
+            batch_size=self.batch_size,
+            num_frames=self.num_frames,
+            num_channels=self.num_channels,
+            min_resolution=self.min_resolution,
+            max_resolution=self.max_resolution,
+            equal_resolution=equal_resolution,
+            numpify=numpify,
+            torchify=torchify,
+        )
+
+
+@require_torch
+@require_vision
+class TvpImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+    image_processing_class = TvpImageProcessor if is_vision_available() else None
+
+    def setUp(self):
+        self.image_processor_tester = TvpImageProcessingTester(self)
+
+    @property
+    def image_processor_dict(self):
+        return self.image_processor_tester.prepare_image_processor_dict()
+
+    def test_image_processor_properties(self):
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        self.assertTrue(hasattr(image_processing, "image_mean"))
+        self.assertTrue(hasattr(image_processing, "image_std"))
+        self.assertTrue(hasattr(image_processing, "do_normalize"))
+        self.assertTrue(hasattr(image_processing, "do_resize"))
+        self.assertTrue(hasattr(image_processing, "do_center_crop"))
+        self.assertTrue(hasattr(image_processing, "size"))
+        self.assertTrue(hasattr(image_processing, "do_rescale"))
+        self.assertTrue(hasattr(image_processing, "do_pad"))
+        self.assertTrue(hasattr(image_processing, "pad_size"))
+
+    def test_image_processor_from_dict_with_kwargs(self):
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict)
+        self.assertEqual(image_processor.size, {"longest_edge": 40})
+
+        image_processor = self.image_processing_class.from_dict(self.image_processor_dict, size={"longest_edge": 12})
+        self.assertEqual(image_processor.size, {"longest_edge": 12})
+
+    def test_call_pil(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random PIL videos
+        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False)
+        for video in video_inputs:
+            self.assertIsInstance(video, list)
+            self.assertIsInstance(video[0], Image.Image)
+
+        # Test not batched input
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
+        encoded_videos = image_processing(video_inputs[0], return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                1,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+        # Test batched
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
+        encoded_videos = image_processing(video_inputs, return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                self.image_processor_tester.batch_size,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+    def test_call_numpy(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
+        for video in video_inputs:
+            self.assertIsInstance(video, list)
+            self.assertIsInstance(video[0], np.ndarray)
+
+        # Test not batched input
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
+        encoded_videos = image_processing(video_inputs[0], return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                1,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+        # Test batched
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
+        encoded_videos = image_processing(video_inputs, return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                self.image_processor_tester.batch_size,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+    def test_call_numpy_4_channels(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random numpy tensors
+        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, numpify=True)
+        for video in video_inputs:
+            self.assertIsInstance(video, list)
+            self.assertIsInstance(video[0], np.ndarray)
+
+        # Test not batched input
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
+        encoded_videos = image_processing(
+            video_inputs[0], return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
+        ).pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                1,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+        # Test batched
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
+        encoded_videos = image_processing(
+            video_inputs, return_tensors="pt", image_mean=0, image_std=1, input_data_format="channels_first"
+        ).pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                self.image_processor_tester.batch_size,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+        self.image_processor_tester.num_channels = 3
+
+    def test_call_pytorch(self):
+        # Initialize image_processing
+        image_processing = self.image_processing_class(**self.image_processor_dict)
+        # create random PyTorch tensors
+        video_inputs = self.image_processor_tester.prepare_video_inputs(equal_resolution=False, torchify=True)
+        for video in video_inputs:
+            self.assertIsInstance(video, list)
+            self.assertIsInstance(video[0], torch.Tensor)
+
+        # Test not batched input
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs)
+        encoded_videos = image_processing(video_inputs[0], return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                1,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
+
+        # Test batched
+        expected_height, expected_width = self.image_processor_tester.get_expected_values(video_inputs, batched=True)
+        encoded_videos = image_processing(video_inputs, return_tensors="pt").pixel_values
+        self.assertEqual(
+            encoded_videos.shape,
+            (
+                self.image_processor_tester.batch_size,
+                self.image_processor_tester.num_frames,
+                self.image_processor_tester.num_channels,
+                expected_height,
+                expected_width,
+            ),
+        )
diff --git a/tests/models/tvp/test_modeling_tvp.py b/tests/models/tvp/test_modeling_tvp.py
new file mode 100644
index 00000000000000..14ec02ed6fd9a9
--- /dev/null
+++ b/tests/models/tvp/test_modeling_tvp.py
@@ -0,0 +1,266 @@
+# coding=utf-8
+# Copyright 2023 The Intel Team Authors, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch TVP model. """
+
+
+import unittest
+
+from transformers import ResNetConfig, TvpConfig
+from transformers.testing_utils import require_torch, require_vision, torch_device
+from transformers.utils import cached_property, is_torch_available, is_vision_available
+
+from ...test_modeling_common import (
+    ModelTesterMixin,
+    _config_zero_init,
+    floats_tensor,
+    ids_tensor,
+    random_attention_mask,
+)
+from ...test_pipeline_mixin import PipelineTesterMixin
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import TvpForVideoGrounding, TvpModel
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import TvpImageProcessor
+
+
+# Copied from test.models.videomae.test_modeling_videomae.VideoMAEModelTester with VideoMAE->TVP
+class TVPModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=1,
+        seq_length=2,
+        alpha=1.0,
+        beta=0.1,
+        visual_prompter_type="framepad",
+        visual_prompter_apply="replace",
+        num_frames=2,
+        max_img_size=448,
+        visual_prompt_size=96,
+        vocab_size=100,
+        hidden_size=32,
+        intermediate_size=32,
+        num_hidden_layers=2,
+        num_attention_heads=4,
+        max_position_embeddings=30,
+        max_grid_col_position_embeddings=30,
+        max_grid_row_position_embeddings=30,
+        hidden_dropout_prob=0.1,
+        hidden_act="gelu",
+        layer_norm_eps=1e-12,
+        initializer_range=0.02,
+        pad_token_id=0,
+        type_vocab_size=2,
+        attention_probs_dropout_prob=0.1,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.input_id_length = seq_length
+        self.seq_length = seq_length + 10 + 784  # include text prompt length and visual input length
+        self.alpha = alpha
+        self.beta = beta
+        self.visual_prompter_type = visual_prompter_type
+        self.visual_prompter_apply = visual_prompter_apply
+        self.num_frames = num_frames
+        self.max_img_size = max_img_size
+        self.visual_prompt_size = visual_prompt_size
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.max_grid_col_position_embeddings = max_grid_col_position_embeddings
+        self.max_grid_row_position_embeddings = max_grid_row_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.pad_token_id = pad_token_id
+        self.type_vocab_size = type_vocab_size
+        self.is_training = False
+        self.num_channels = 3
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.input_id_length], self.vocab_size)
+        attention_mask = random_attention_mask([self.batch_size, self.input_id_length])
+        pixel_values = floats_tensor(
+            [self.batch_size, self.num_frames, self.num_channels, self.max_img_size, self.max_img_size]
+        )
+
+        config = self.get_config()
+
+        return (config, input_ids, pixel_values, attention_mask)
+
+    def get_config(self):
+        resnet_config = ResNetConfig(
+            num_channels=3,
+            embeddings_size=64,
+            hidden_sizes=[64, 128],
+            depths=[2, 2],
+            hidden_act="relu",
+            out_features=["stage2"],
+            out_indices=[2],
+        )
+        return TvpConfig(
+            backbone_config=resnet_config,
+            alpha=self.alpha,
+            beta=self.beta,
+            visual_prompter_type=self.visual_prompter_type,
+            visual_prompter_apply=self.visual_prompter_apply,
+            num_frames=self.num_frames,
+            max_img_size=self.max_img_size,
+            visual_prompt_size=self.visual_prompt_size,
+            vocab_size=self.vocab_size,
+            hidden_size=self.hidden_size,
+            num_hidden_layers=self.num_hidden_layers,
+            num_attention_heads=self.num_attention_heads,
+            intermediate_size=self.intermediate_size,
+            hidden_act=self.hidden_act,
+            hidden_dropout_prob=self.hidden_dropout_prob,
+            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+            max_position_embeddings=self.max_position_embeddings,
+            max_grid_col_position_embeddings=self.max_grid_col_position_embeddings,
+            max_grid_row_position_embeddings=self.max_grid_row_position_embeddings,
+            layer_norm_eps=self.layer_norm_eps,
+            initializer_range=self.initializer_range,
+            pad_token_id=self.pad_token_id,
+            type_vocab_size=self.type_vocab_size,
+        )
+
+    def create_and_check_model(self, config, input_ids, pixel_values, attention_mask):
+        model = TvpModel(config=config)
+        model.to(torch_device)
+        model.eval()
+        result = model(input_ids, pixel_values, attention_mask)
+        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, pixel_values, attention_mask = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "pixel_values": pixel_values, "attention_mask": attention_mask}
+        return config, inputs_dict
+
+
+@require_torch
+class TVPModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase):
+    """
+    Here we also overwrite some of the tests of test_modeling_common.py, as TVP does not use, inputs_embeds.
+    The seq_length in TVP contain textual and visual inputs, and prompt.
+    """
+
+    all_model_classes = (TvpModel, TvpForVideoGrounding) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {"feature-extraction": TvpModel, "temporal-video-grounding": TvpForVideoGrounding}
+        if is_torch_available()
+        else {}
+    )
+
+    # TODO: Enable this once this model gets more usage
+    test_torchscript = False
+
+    def setUp(self):
+        self.model_tester = TVPModelTester(self)
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    @unittest.skip(reason="TVP does not use inputs_embeds")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="TVPModel does not have input/output embeddings")
+    def test_model_common_attributes(self):
+        pass
+
+    # override as the `logit_scale` parameter initilization is different for TVP
+    def test_initialization(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        configs_no_init = _config_zero_init(config)
+        for model_class in self.all_model_classes:
+            model = model_class(config=configs_no_init)
+            for name, param in model.named_parameters():
+                if param.requires_grad:
+                    # params are randomly initialized.
+                    self.assertAlmostEqual(
+                        param.data.mean().item(),
+                        0.0,
+                        delta=1.0,
+                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
+                    )
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+@require_vision
+@require_torch
+class TvpModelIntegrationTests(unittest.TestCase):
+    @cached_property
+    def default_image_processor(self):
+        return TvpImageProcessor.from_pretrained("Jiqing/tiny-random-tvp") if is_vision_available() else None
+
+    def test_inference_no_head(self):
+        model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
+
+        image_processor = self.default_image_processor
+        image = prepare_img()
+        encoding = image_processor(images=image, return_tensors="pt")
+        input_ids = torch.tensor([[1, 2]])
+        attention_mask = torch.tensor([[1, 1]])
+        encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
+        encoding.to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(**encoding)
+
+        expected_shape = torch.Size((1, 796, 128))
+        assert outputs.last_hidden_state.shape == expected_shape
+        expected_slice = torch.tensor(
+            [[-0.4902, -0.4121, -1.7872], [-0.2184, 2.1211, -0.9371], [0.1180, 0.5003, -0.1727]]
+        ).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.last_hidden_state[0, :3, :3], expected_slice, atol=1e-4))
+
+    def test_inference_with_head(self):
+        model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp").to(torch_device)
+
+        image_processor = self.default_image_processor
+        image = prepare_img()
+        encoding = image_processor(images=image, return_tensors="pt")
+        input_ids = torch.tensor([[1, 2]])
+        attention_mask = torch.tensor([[1, 1]])
+        encoding.update({"input_ids": input_ids, "attention_mask": attention_mask})
+        encoding.to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(**encoding)
+
+        expected_shape = torch.Size((1, 2))
+        assert outputs.logits.shape == expected_shape
+        expected_slice = torch.tensor([[0.5061, 0.4988]]).to(torch_device)
+        self.assertTrue(torch.allclose(outputs.logits, expected_slice, atol=1e-4))
diff --git a/tests/models/univnet/__init__.py b/tests/models/univnet/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/univnet/test_feature_extraction_univnet.py b/tests/models/univnet/test_feature_extraction_univnet.py
new file mode 100644
index 00000000000000..dfa335d15383ee
--- /dev/null
+++ b/tests/models/univnet/test_feature_extraction_univnet.py
@@ -0,0 +1,365 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import itertools
+import os
+import random
+import tempfile
+import unittest
+
+import numpy as np
+from datasets import Audio, load_dataset
+
+from transformers import UnivNetFeatureExtractor
+from transformers.testing_utils import check_json_file_has_correct_format, require_torch, slow
+from transformers.utils.import_utils import is_torch_available
+
+from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin
+
+
+if is_torch_available():
+    import torch
+
+
+global_rng = random.Random()
+
+
+# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
+def floats_list(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor"""
+    if rng is None:
+        rng = global_rng
+
+    values = []
+    for batch_idx in range(shape[0]):
+        values.append([])
+        for _ in range(shape[1]):
+            values[-1].append(rng.random() * scale)
+
+    return values
+
+
+class UnivNetFeatureExtractionTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        min_seq_length=400,
+        max_seq_length=2000,
+        feature_size=1,
+        sampling_rate=24000,
+        padding_value=0.0,
+        do_normalize=True,
+        num_mel_bins=100,
+        hop_length=256,
+        win_length=1024,
+        win_function="hann_window",
+        filter_length=1024,
+        max_length_s=10,
+        fmin=0.0,
+        fmax=12000,
+        mel_floor=1e-9,
+        center=False,
+        compression_factor=1.0,
+        compression_clip_val=1e-5,
+        normalize_min=-11.512925148010254,
+        normalize_max=2.3143386840820312,
+        model_in_channels=64,
+        pad_end_length=10,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.min_seq_length = min_seq_length
+        self.max_seq_length = max_seq_length
+        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
+
+        self.feature_size = feature_size
+        self.sampling_rate = sampling_rate
+        self.padding_value = padding_value
+        self.do_normalize = do_normalize
+        self.num_mel_bins = num_mel_bins
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.win_function = win_function
+        self.filter_length = filter_length
+        self.max_length_s = max_length_s
+        self.fmin = fmin
+        self.fmax = fmax
+        self.mel_floor = mel_floor
+        self.center = center
+        self.compression_factor = compression_factor
+        self.compression_clip_val = compression_clip_val
+        self.normalize_min = normalize_min
+        self.normalize_max = normalize_max
+        self.model_in_channels = model_in_channels
+        self.pad_end_length = pad_end_length
+
+    def prepare_feat_extract_dict(self):
+        return {
+            "feature_size": self.feature_size,
+            "sampling_rate": self.sampling_rate,
+            "padding_value": self.padding_value,
+            "do_normalize": self.do_normalize,
+            "num_mel_bins": self.num_mel_bins,
+            "hop_length": self.hop_length,
+            "win_length": self.win_length,
+            "win_function": self.win_function,
+            "filter_length": self.filter_length,
+            "max_length_s": self.max_length_s,
+            "fmin": self.fmin,
+            "fmax": self.fmax,
+            "mel_floor": self.mel_floor,
+            "center": self.center,
+            "compression_factor": self.compression_factor,
+            "compression_clip_val": self.compression_clip_val,
+            "normalize_min": self.normalize_min,
+            "normalize_max": self.normalize_max,
+            "model_in_channels": self.model_in_channels,
+            "pad_end_length": self.pad_end_length,
+        }
+
+    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
+        def _flatten(list_of_lists):
+            return list(itertools.chain(*list_of_lists))
+
+        if equal_length:
+            speech_inputs = floats_list((self.batch_size, self.max_seq_length))
+        else:
+            # make sure that inputs increase in size
+            speech_inputs = [
+                _flatten(floats_list((x, self.feature_size)))
+                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
+            ]
+
+        if numpify:
+            speech_inputs = [np.asarray(x) for x in speech_inputs]
+
+        return speech_inputs
+
+
+class UnivNetFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
+    feature_extraction_class = UnivNetFeatureExtractor
+
+    def setUp(self):
+        self.feat_extract_tester = UnivNetFeatureExtractionTester(self)
+
+    # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_from_and_save_pretrained
+    def test_feat_extract_from_and_save_pretrained(self):
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
+            check_json_file_has_correct_format(saved_file)
+            feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        mel_1 = feat_extract_first.mel_filters
+        mel_2 = feat_extract_second.mel_filters
+        self.assertTrue(np.allclose(mel_1, mel_2))
+        self.assertEqual(dict_first, dict_second)
+
+    # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_feat_extract_to_json_file
+    def test_feat_extract_to_json_file(self):
+        feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "feat_extract.json")
+            feat_extract_first.to_json_file(json_file_path)
+            feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
+
+        dict_first = feat_extract_first.to_dict()
+        dict_second = feat_extract_second.to_dict()
+        mel_1 = feat_extract_first.mel_filters
+        mel_2 = feat_extract_second.mel_filters
+        self.assertTrue(np.allclose(mel_1, mel_2))
+        self.assertEqual(dict_first, dict_second)
+
+    def test_call(self):
+        # Tests that all call wrap to encode_plus and batch_encode_plus
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        # create three inputs of length 800, 1000, and 1200
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        # Test feature size
+        input_features = feature_extractor(
+            np_speech_inputs, padding="max_length", max_length=1600, return_tensors="np"
+        ).input_features
+        self.assertTrue(input_features.ndim == 3)
+        # Note: for some reason I get a weird padding error when feature_size > 1
+        # self.assertTrue(input_features.shape[-2] == feature_extractor.feature_size)
+        # Note: we use the shape convention (batch_size, seq_len, num_mel_bins)
+        self.assertTrue(input_features.shape[-1] == feature_extractor.num_mel_bins)
+
+        # Test not batched input
+        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_features
+        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))
+
+        # Test batched
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+        # Test 2-D numpy arrays are batched.
+        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
+        np_speech_inputs = np.asarray(speech_inputs)
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+        # Test truncation required
+        speech_inputs = [
+            floats_list((1, x))[0]
+            for x in range((feature_extractor.num_max_samples - 100), (feature_extractor.num_max_samples + 500), 200)
+        ]
+        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]
+
+        speech_inputs_truncated = [x[: feature_extractor.num_max_samples] for x in speech_inputs]
+        np_speech_inputs_truncated = [np.asarray(speech_input) for speech_input in speech_inputs_truncated]
+
+        encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(np_speech_inputs_truncated, return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+    def test_batched_unbatched_consistency(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        speech_inputs = floats_list((1, 800))[0]
+        np_speech_inputs = np.asarray(speech_inputs)
+
+        # Test unbatched vs batched list
+        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor([speech_inputs], return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+        # Test np.ndarray vs List[np.ndarray]
+        encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor([np_speech_inputs], return_tensors="np").input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+        # Test unbatched np.ndarray vs batched np.ndarray
+        encoded_sequences_1 = feature_extractor(np_speech_inputs, return_tensors="np").input_features
+        encoded_sequences_2 = feature_extractor(
+            np.expand_dims(np_speech_inputs, axis=0), return_tensors="np"
+        ).input_features
+        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
+            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
+
+    def test_generate_noise(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+
+        features = feature_extractor(speech_inputs, return_noise=True)
+        input_features = features.input_features
+        noise_features = features.noise_sequence
+
+        for spectrogram, noise in zip(input_features, noise_features):
+            self.assertEqual(spectrogram.shape[0], noise.shape[0])
+
+    def test_pad_end(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+
+        input_features1 = feature_extractor(speech_inputs, padding=False, pad_end=False).input_features
+        input_features2 = feature_extractor(speech_inputs, padding=False, pad_end=True).input_features
+
+        for spectrogram1, spectrogram2 in zip(input_features1, input_features2):
+            self.assertEqual(spectrogram1.shape[0] + self.feat_extract_tester.pad_end_length, spectrogram2.shape[0])
+
+    def test_generate_noise_and_pad_end(self):
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
+
+        features = feature_extractor(speech_inputs, padding=False, return_noise=True, pad_end=True)
+        input_features = features.input_features
+        noise_features = features.noise_sequence
+
+        for spectrogram, noise in zip(input_features, noise_features):
+            self.assertEqual(spectrogram.shape[0], noise.shape[0])
+
+    @require_torch
+    def test_batch_decode(self):
+        import torch
+
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_dict)
+        input_lengths = list(range(800, 1400, 200))
+        pad_samples = feature_extractor.pad_end_length * feature_extractor.hop_length
+        output_features = {
+            "waveforms": torch.tensor(floats_list((3, max(input_lengths) + pad_samples))),
+            "waveform_lengths": torch.tensor(input_lengths),
+        }
+        waveforms = feature_extractor.batch_decode(**output_features)
+
+        for input_length, waveform in zip(input_lengths, waveforms):
+            self.assertTrue(len(waveform.shape) == 1, msg="Individual output waveforms should be 1D")
+            self.assertEqual(waveform.shape[0], input_length)
+
+    @require_torch
+    # Copied from tests.models.whisper.test_feature_extraction_whisper.WhisperFeatureExtractionTest.test_double_precision_pad
+    def test_double_precision_pad(self):
+        import torch
+
+        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
+        np_speech_inputs = np.random.rand(100, 32).astype(np.float64)
+        py_speech_inputs = np_speech_inputs.tolist()
+
+        for inputs in [py_speech_inputs, np_speech_inputs]:
+            np_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="np")
+            self.assertTrue(np_processed.input_features.dtype == np.float32)
+            pt_processed = feature_extractor.pad([{"input_features": inputs}], return_tensors="pt")
+            self.assertTrue(pt_processed.input_features.dtype == torch.float32)
+
+    def _load_datasamples(self, num_samples):
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = ds.cast_column("audio", Audio(sampling_rate=self.feat_extract_tester.sampling_rate))
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
+
+    @slow
+    @require_torch
+    def test_integration(self):
+        # fmt: off
+        EXPECTED_INPUT_FEATURES = torch.tensor(
+            [
+                -5.0229, -6.1358, -5.8346, -5.4447, -5.6707, -5.8577, -5.0464, -5.0058,
+                -5.6015, -5.6410, -5.4325, -5.6116, -5.3700, -5.7956, -5.3196, -5.3274,
+                -5.9655, -5.6057, -5.8382, -5.9602, -5.9005, -5.9123, -5.7669, -6.1441,
+                -5.5168, -5.1405, -5.3927, -6.0032, -5.5784, -5.3728
+            ],
+        )
+        # fmt: on
+
+        input_speech, sr = self._load_datasamples(1)
+
+        feature_extractor = UnivNetFeatureExtractor()
+        input_features = feature_extractor(input_speech, sampling_rate=sr[0], return_tensors="pt").input_features
+        self.assertEqual(input_features.shape, (1, 548, 100))
+
+        input_features_mean = torch.mean(input_features)
+        input_features_stddev = torch.std(input_features)
+
+        EXPECTED_MEAN = torch.tensor(-6.18862009)
+        EXPECTED_STDDEV = torch.tensor(2.80845642)
+
+        torch.testing.assert_close(input_features_mean, EXPECTED_MEAN, atol=5e-5, rtol=5e-6)
+        torch.testing.assert_close(input_features_stddev, EXPECTED_STDDEV)
+        torch.testing.assert_close(input_features[0, :30, 0], EXPECTED_INPUT_FEATURES, atol=1e-4, rtol=1e-5)
diff --git a/tests/models/univnet/test_modeling_univnet.py b/tests/models/univnet/test_modeling_univnet.py
new file mode 100644
index 00000000000000..b1512af284ef01
--- /dev/null
+++ b/tests/models/univnet/test_modeling_univnet.py
@@ -0,0 +1,369 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import inspect
+import random
+import unittest
+
+from datasets import Audio, load_dataset
+
+from transformers import UnivNetConfig, UnivNetFeatureExtractor
+from transformers.testing_utils import (
+    is_torch_available,
+    require_torch,
+    require_torch_gpu,
+    slow,
+    torch_device,
+)
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import (
+    ModelTesterMixin,
+    floats_tensor,
+)
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import UnivNetModel
+
+
+class UnivNetModelTester:
+    def __init__(
+        self,
+        parent,
+        batch_size=2,
+        seq_length=7,
+        in_channels=8,
+        hidden_channels=8,
+        num_mel_bins=20,
+        kernel_predictor_hidden_channels=8,
+        seed=0,
+        is_training=False,
+    ):
+        self.parent = parent
+        self.batch_size = batch_size
+        self.seq_length = seq_length
+        self.in_channels = in_channels
+        self.hidden_channels = hidden_channels
+        self.num_mel_bins = num_mel_bins
+        self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels
+        self.seed = seed
+        self.is_training = is_training
+
+    def prepare_noise_sequence(self):
+        generator = torch.manual_seed(self.seed)
+        noise_shape = (self.seq_length, self.in_channels)
+        # Create noise on CPU for reproducibility
+        noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
+        return noise_sequence
+
+    def prepare_config_and_inputs(self):
+        spectrogram = floats_tensor([self.seq_length, self.num_mel_bins], scale=1.0)
+        noise_sequence = self.prepare_noise_sequence()
+        noise_sequence = noise_sequence.to(spectrogram.device)
+        config = self.get_config()
+        return config, spectrogram, noise_sequence
+
+    def get_config(self):
+        return UnivNetConfig(
+            model_in_channels=self.in_channels,
+            model_hidden_channels=self.hidden_channels,
+            num_mel_bins=self.num_mel_bins,
+            kernel_predictor_hidden_channels=self.kernel_predictor_hidden_channels,
+        )
+
+    def create_and_check_model(self, config, spectrogram, noise_sequence):
+        model = UnivNetModel(config=config).to(torch_device).eval()
+        result = model(spectrogram, noise_sequence)[0]
+        self.parent.assertEqual(result.shape, (1, self.seq_length * 256))
+
+    def prepare_config_and_inputs_for_common(self):
+        config, spectrogram, noise_sequence = self.prepare_config_and_inputs()
+        inputs_dict = {"input_features": spectrogram, "noise_sequence": noise_sequence}
+        return config, inputs_dict
+
+
+@require_torch
+class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
+    all_model_classes = (UnivNetModel,) if is_torch_available() else ()
+    # UnivNetModel currently cannot be traced with torch.jit.trace.
+    test_torchscript = False
+    # The UnivNetModel is not a transformer and does not use any attention mechanisms, so skip transformer/attention
+    # related tests.
+    test_pruning = False
+    test_resize_embeddings = False
+    test_resize_position_embeddings = False
+    test_head_masking = False
+    # UnivNetModel is not a sequence classification model.
+    test_mismatched_shapes = False
+    # UnivNetModel does not have a base_model_prefix attribute.
+    test_missing_keys = False
+    # UnivNetModel does not implement a parallelize method.
+    test_model_parallel = False
+    is_encoder_decoder = False
+    has_attentions = False
+
+    input_name = "input_features"
+
+    def setUp(self):
+        self.model_tester = UnivNetModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=UnivNetConfig)
+
+    @unittest.skip(reason="fix this once it gets more usage")
+    def test_multi_gpu_data_parallel_forward(self):
+        super().test_multi_gpu_data_parallel_forward()
+
+    def test_config(self):
+        self.config_tester.create_and_test_config_to_json_string()
+        self.config_tester.create_and_test_config_to_json_file()
+        self.config_tester.create_and_test_config_from_and_save_pretrained()
+        self.config_tester.create_and_test_config_from_and_save_pretrained_subfolder()
+        self.config_tester.create_and_test_config_with_num_labels()
+        self.config_tester.check_config_can_be_init_without_params()
+        self.config_tester.check_config_arguments_init()
+
+    def test_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_model(*config_and_inputs)
+
+    def test_forward_signature(self):
+        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            signature = inspect.signature(model.forward)
+            # signature.parameters is an OrderedDict => so arg_names order is deterministic
+            arg_names = [*signature.parameters.keys()]
+
+            expected_arg_names = [
+                "input_features",
+            ]
+            self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+
+    @unittest.skip(reason="UnivNetModel does not output hidden_states.")
+    def test_hidden_states_output(self):
+        pass
+
+    @unittest.skip(reason="UnivNetModel.forward does not accept an inputs_embeds argument.")
+    def test_inputs_embeds(self):
+        pass
+
+    @unittest.skip(reason="UnivNetModel does not use input embeddings and thus has no get_input_embeddings method.")
+    def test_model_common_attributes(self):
+        pass
+
+    @unittest.skip(reason="UnivNetModel does not support all arguments tested, such as output_hidden_states.")
+    def test_model_outputs_equivalence(self):
+        pass
+
+    @unittest.skip(reason="UnivNetModel does not output hidden_states.")
+    def test_retain_grad_hidden_states_attentions(self):
+        pass
+
+    def test_batched_inputs_outputs(self):
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            batched_spectrogram = inputs["input_features"].unsqueeze(0).repeat(2, 1, 1)
+            batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0).repeat(2, 1, 1)
+            with torch.no_grad():
+                batched_outputs = model(
+                    batched_spectrogram.to(torch_device),
+                    batched_noise_sequence.to(torch_device),
+                )[0]
+
+            self.assertEqual(
+                batched_spectrogram.shape[0],
+                batched_outputs.shape[0],
+                msg="Got different batch dims for input and output",
+            )
+
+    def test_unbatched_inputs_outputs(self):
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            with torch.no_grad():
+                outputs = model(inputs["input_features"].to(torch_device), inputs["noise_sequence"].to(torch_device))[
+                    0
+                ]
+            self.assertTrue(outputs.shape[0] == 1, msg="Unbatched input should create batched output with bsz = 1")
+
+    def test_unbatched_batched_outputs_consistency(self):
+        config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            unbatched_spectrogram = inputs["input_features"].detach().clone()
+            unbatched_noise_sequence = inputs["noise_sequence"].detach().clone()
+            batched_spectrogram = inputs["input_features"].unsqueeze(0)
+            batched_noise_sequence = inputs["noise_sequence"].unsqueeze(0)
+
+            with torch.no_grad():
+                unbatched_outputs = model(
+                    unbatched_spectrogram.to(torch_device),
+                    unbatched_noise_sequence.to(torch_device),
+                )[0]
+
+                batched_outputs = model(
+                    batched_spectrogram.to(torch_device),
+                    batched_noise_sequence.to(torch_device),
+                )[0]
+
+            torch.testing.assert_close(unbatched_outputs, batched_outputs)
+
+
+@require_torch_gpu
+@slow
+class UnivNetModelIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def _load_datasamples(self, num_samples, sampling_rate=24000):
+        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
+        # automatic decoding with librispeech
+        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
+
+        return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
+
+    def get_inputs(self, device, num_samples: int = 3, noise_length: int = 10, seed: int = 0):
+        generator = torch.manual_seed(seed)
+        # Note: hardcode model_in_channels -> 64
+        if num_samples == 1:
+            noise_sequence_shape = (64, noise_length)
+        else:
+            noise_sequence_shape = (num_samples, 64, noise_length)
+        # Explicity generate noise_sequence on CPU for consistency.
+        noise_sequence = torch.randn(noise_sequence_shape, generator=generator, dtype=torch.float32, device="cpu")
+        # Put noise_sequence on the desired device.
+        noise_sequence = noise_sequence.to(device)
+
+        # Note: hardcode num_mel_channels -> 100
+        if num_samples == 1:
+            spectrogram_shape = [100, noise_length]
+        else:
+            spectrogram_shape = [num_samples, 100, noise_length]
+        spectrogram = floats_tensor(spectrogram_shape, scale=1.0, rng=random.Random(seed))
+        # Note: spectrogram should already be on torch_device
+
+        # Permute to match diffusers implementation
+        if num_samples == 1:
+            noise_sequence = noise_sequence.transpose(1, 0)
+            spectrogram = spectrogram.transpose(1, 0)
+        else:
+            noise_sequence = noise_sequence.transpose(2, 1)
+            spectrogram = spectrogram.transpose(2, 1)
+
+        inputs = {
+            "input_features": spectrogram,
+            "noise_sequence": noise_sequence,
+            "generator": generator,
+        }
+
+        return inputs
+
+    def test_model_inference_batched(self):
+        # Load sample checkpoint from Tortoise TTS
+        model = UnivNetModel.from_pretrained("dg845/univnet-dev")
+        model.eval().to(torch_device)
+
+        # Get batched noise and spectrogram inputs.
+        input_speech = self.get_inputs(torch_device, num_samples=3)
+
+        with torch.no_grad():
+            waveform = model(**input_speech)[0]
+        waveform = waveform.cpu()
+
+        waveform_mean = torch.mean(waveform)
+        waveform_stddev = torch.std(waveform)
+        waveform_slice = waveform[-1, -9:].flatten()
+
+        EXPECTED_MEAN = torch.tensor(-0.19989729)
+        EXPECTED_STDDEV = torch.tensor(0.35230172)
+        EXPECTED_SLICE = torch.tensor([-0.3408, -0.6045, -0.5052, 0.1160, -0.1556, -0.0405, -0.3024, -0.5290, -0.5019])
+
+        torch.testing.assert_close(waveform_mean, EXPECTED_MEAN, atol=1e-4, rtol=1e-5)
+        torch.testing.assert_close(waveform_stddev, EXPECTED_STDDEV, atol=1e-4, rtol=1e-5)
+        torch.testing.assert_close(waveform_slice, EXPECTED_SLICE, atol=5e-4, rtol=1e-5)
+
+    def test_model_inference_unbatched(self):
+        # Load sample checkpoint from Tortoise TTS
+        model = UnivNetModel.from_pretrained("dg845/univnet-dev")
+        model.eval().to(torch_device)
+
+        # Get unbatched noise and spectrogram inputs.
+        input_speech = self.get_inputs(torch_device, num_samples=1)
+
+        with torch.no_grad():
+            waveform = model(**input_speech)[0]
+        waveform = waveform.cpu()
+
+        waveform_mean = torch.mean(waveform)
+        waveform_stddev = torch.std(waveform)
+        waveform_slice = waveform[-1, -9:].flatten()
+
+        EXPECTED_MEAN = torch.tensor(-0.22895093)
+        EXPECTED_STDDEV = torch.tensor(0.33986747)
+        EXPECTED_SLICE = torch.tensor([-0.3276, -0.5504, -0.3484, 0.3574, -0.0373, -0.1826, -0.4880, -0.6431, -0.5162])
+
+        torch.testing.assert_close(waveform_mean, EXPECTED_MEAN, atol=1e-4, rtol=1e-5)
+        torch.testing.assert_close(waveform_stddev, EXPECTED_STDDEV, atol=1e-4, rtol=1e-5)
+        torch.testing.assert_close(waveform_slice, EXPECTED_SLICE, atol=1e-3, rtol=1e-5)
+
+    def test_integration(self):
+        feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")
+        model = UnivNetModel.from_pretrained("dg845/univnet-dev")
+        model.eval().to(torch_device)
+
+        audio, sr = self._load_datasamples(1, sampling_rate=feature_extractor.sampling_rate)
+
+        input_features = feature_extractor(audio, sampling_rate=sr[0], return_tensors="pt").input_features
+        input_features = input_features.to(device=torch_device)
+
+        input_speech = self.get_inputs(torch_device, num_samples=1, noise_length=input_features.shape[1])
+        input_speech["input_features"] = input_features
+
+        with torch.no_grad():
+            waveform = model(**input_speech)[0]
+        waveform = waveform.cpu()
+
+        waveform_mean = torch.mean(waveform)
+        waveform_stddev = torch.std(waveform)
+        waveform_slice = waveform[-1, -9:].flatten()
+
+        EXPECTED_MEAN = torch.tensor(0.00051374)
+        EXPECTED_STDDEV = torch.tensor(0.058105603)
+        # fmt: off
+        EXPECTED_SLICE = torch.tensor([-4.3934e-04, -1.8203e-04, -3.3033e-04, -3.8716e-04, -1.6125e-04, 3.5389e-06, -3.3149e-04, -3.7613e-04, -2.3331e-04])
+        # fmt: on
+
+        torch.testing.assert_close(waveform_mean, EXPECTED_MEAN, atol=5e-6, rtol=1e-5)
+        torch.testing.assert_close(waveform_stddev, EXPECTED_STDDEV, atol=1e-4, rtol=1e-5)
+        torch.testing.assert_close(waveform_slice, EXPECTED_SLICE, atol=5e-6, rtol=1e-5)
diff --git a/tests/models/upernet/test_modeling_upernet.py b/tests/models/upernet/test_modeling_upernet.py
index 84c32f7233e73a..aeeba191b67af2 100644
--- a/tests/models/upernet/test_modeling_upernet.py
+++ b/tests/models/upernet/test_modeling_upernet.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch UperNet framework. """
 
 
-import inspect
 import unittest
 
 from huggingface_hub import hf_hub_download
@@ -170,18 +169,6 @@ def test_config(self):
     def create_and_test_config_common_properties(self):
         return
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_for_semantic_segmentation(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_for_semantic_segmentation(*config_and_inputs)
diff --git a/tests/models/videomae/test_modeling_videomae.py b/tests/models/videomae/test_modeling_videomae.py
index 9fb9c9e7f376b2..2fd9f90c308558 100644
--- a/tests/models/videomae/test_modeling_videomae.py
+++ b/tests/models/videomae/test_modeling_videomae.py
@@ -16,7 +16,6 @@
 
 
 import copy
-import inspect
 import unittest
 
 import numpy as np
@@ -228,18 +227,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/vipllava/__init__.py b/tests/models/vipllava/__init__.py
new file mode 100644
index 00000000000000..e69de29bb2d1d6
diff --git a/tests/models/vipllava/test_modeling_vipllava.py b/tests/models/vipllava/test_modeling_vipllava.py
new file mode 100644
index 00000000000000..e09527343e24fa
--- /dev/null
+++ b/tests/models/vipllava/test_modeling_vipllava.py
@@ -0,0 +1,216 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Testing suite for the PyTorch VipLlava model. """
+
+import gc
+import unittest
+
+import requests
+
+from transformers import (
+    AutoProcessor,
+    VipLlavaConfig,
+    VipLlavaForConditionalGeneration,
+    is_torch_available,
+    is_vision_available,
+)
+from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+    import torch
+else:
+    is_torch_greater_or_equal_than_2_0 = False
+
+if is_vision_available():
+    from PIL import Image
+
+
+# Copied from transformers.tests.models.llava.test_modeling_llava.LlavaVisionText2TextModelTester with Llava->VipLlava
+class VipLlavaVisionText2TextModelTester:
+    # Ignore copy
+    def __init__(
+        self,
+        parent,
+        ignore_index=-100,
+        image_token_index=0,
+        projector_hidden_act="gelu",
+        seq_length=7,
+        vision_feature_layers=[0, 0, 1, 1, 0],
+        text_config={
+            "model_type": "llama",
+            "seq_length": 7,
+            "is_training": True,
+            "use_input_mask": True,
+            "use_token_type_ids": False,
+            "use_labels": True,
+            "vocab_size": 99,
+            "hidden_size": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 37,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "attention_probs_dropout_prob": 0.1,
+            "max_position_embeddings": 512,
+            "type_vocab_size": 16,
+            "type_sequence_label_size": 2,
+            "initializer_range": 0.02,
+            "num_labels": 3,
+            "num_choices": 4,
+            "pad_token_id": 0,
+        },
+        is_training=True,
+        vision_config={
+            "batch_size": 12,
+            "image_size": 30,
+            "patch_size": 2,
+            "num_channels": 3,
+            "is_training": True,
+            "hidden_size": 32,
+            "projection_dim": 32,
+            "num_hidden_layers": 2,
+            "num_attention_heads": 4,
+            "intermediate_size": 37,
+            "dropout": 0.1,
+            "attention_dropout": 0.1,
+            "initializer_range": 0.02,
+        },
+    ):
+        self.parent = parent
+        self.ignore_index = ignore_index
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.vision_feature_layers = vision_feature_layers
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.seq_length = seq_length
+
+        self.num_hidden_layers = text_config["num_hidden_layers"]
+        self.vocab_size = text_config["vocab_size"]
+        self.hidden_size = text_config["hidden_size"]
+        self.num_attention_heads = text_config["num_attention_heads"]
+        self.is_training = is_training
+
+        self.batch_size = 3
+        self.num_channels = 3
+        self.image_size = 336
+        self.encoder_seq_length = 231
+
+    def get_config(self):
+        return VipLlavaConfig(
+            text_config=self.text_config,
+            vision_config=self.vision_config,
+            ignore_index=self.ignore_index,
+            image_token_index=self.image_token_index,
+            projector_hidden_act=self.projector_hidden_act,
+            vision_feature_layers=self.vision_feature_layers,
+        )
+
+    def prepare_config_and_inputs(self):
+        pixel_values = floats_tensor(
+            [
+                self.batch_size,
+                self.vision_config["num_channels"],
+                self.vision_config["image_size"],
+                self.vision_config["image_size"],
+            ]
+        )
+        config = self.get_config()
+
+        return config, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, pixel_values = config_and_inputs
+        input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
+        attention_mask = input_ids.ne(1).to(torch_device)
+        # we are giving 3 images let's make sure we pass in 3 image tokens
+        input_ids[:, 1] = config.image_token_index
+        inputs_dict = {
+            "pixel_values": pixel_values,
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+        }
+        return config, inputs_dict
+
+
+@require_torch
+# Copied from transformers.tests.models.llava.test_modeling_llava.LlavaForConditionalGenerationModelTest with Llava->VipLlava
+class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, unittest.TestCase):
+    """
+    Model tester for `VipLlavaForConditionalGeneration`.
+    """
+
+    all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
+    fx_compatible = False
+    test_pruning = False
+    test_resize_embeddings = True
+    test_head_masking = False
+
+    def setUp(self):
+        self.model_tester = VipLlavaVisionText2TextModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=VipLlavaConfig, has_text_modality=False)
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant(self):
+        pass
+
+    @unittest.skip(
+        reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
+    )
+    def test_training_gradient_checkpointing_use_reentrant_false(self):
+        pass
+
+
+@require_torch
+class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
+    def setUp(self):
+        self.processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    @slow
+    @require_bitsandbytes
+    def test_small_model_integration_test(self):
+        model_id = "llava-hf/vip-llava-7b-hf"
+
+        model = VipLlavaForConditionalGeneration.from_pretrained(model_id, load_in_4bit=True)
+        processor = AutoProcessor.from_pretrained(model_id)
+
+        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
+
+        image = Image.open(requests.get(url, stream=True).raw)
+        prompt = "USER: <image>\nCan you please describe this image?\nASSISTANT:"
+
+        inputs = processor(prompt, image, return_tensors="pt").to(torch_device, torch.float16)
+
+        outputs = model.generate(**inputs, max_new_tokens=10)
+
+        EXPECTED_OUTPUT = "USER: <image> \nCan you please describe this image?\nASSISTANT: The image features a brown and white cat sitting on"
+        self.assertEqual(processor.decode(outputs[0], skip_special_tokens=True), EXPECTED_OUTPUT)
diff --git a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
index 8ac4449cf3c9c6..71b1ff71067944 100644
--- a/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
+++ b/tests/models/vision_encoder_decoder/test_modeling_vision_encoder_decoder.py
@@ -800,7 +800,7 @@ def generate_step(pixel_values):
 
         preds, scores = generate_step(pixel_values)
 
-        EXPECTED_SCORES = np.array([-0.59562886])
+        EXPECTED_SCORES = np.array([-0.5956343])
         max_diff = np.amax(np.abs(scores - EXPECTED_SCORES))
         self.assertLessEqual(max_diff, 1e-4)
 
diff --git a/tests/models/vit/test_modeling_vit.py b/tests/models/vit/test_modeling_vit.py
index d1e887183329c9..2e9a632a3719d4 100644
--- a/tests/models/vit/test_modeling_vit.py
+++ b/tests/models/vit/test_modeling_vit.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ViT model. """
 
 
-import inspect
 import unittest
 
 from transformers import ViTConfig
@@ -224,18 +223,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
index 3ea407eafd4e27..870a4c83358315 100644
--- a/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
+++ b/tests/models/vit_hybrid/test_modeling_vit_hybrid.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ViT Hybrid model. """
 
 
-import inspect
 import unittest
 
 from transformers import ViTHybridConfig
@@ -185,18 +184,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/vit_mae/test_modeling_vit_mae.py b/tests/models/vit_mae/test_modeling_vit_mae.py
index 89a3a0d803e469..21a66b8a6d92a2 100644
--- a/tests/models/vit_mae/test_modeling_vit_mae.py
+++ b/tests/models/vit_mae/test_modeling_vit_mae.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ViTMAE model. """
 
 
-import inspect
 import math
 import tempfile
 import unittest
@@ -192,18 +191,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/vit_msn/test_modeling_vit_msn.py b/tests/models/vit_msn/test_modeling_vit_msn.py
index a5316377515036..2125e897b9ca49 100644
--- a/tests/models/vit_msn/test_modeling_vit_msn.py
+++ b/tests/models/vit_msn/test_modeling_vit_msn.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch ViTMSN model. """
 
 
-import inspect
 import unittest
 
 from transformers import ViTMSNConfig
@@ -183,18 +182,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/vitdet/test_modeling_vitdet.py b/tests/models/vitdet/test_modeling_vitdet.py
index 361e563d58d448..2df1b7925789aa 100644
--- a/tests/models/vitdet/test_modeling_vitdet.py
+++ b/tests/models/vitdet/test_modeling_vitdet.py
@@ -15,11 +15,10 @@
 """ Testing suite for the PyTorch ViTDet model. """
 
 
-import inspect
 import unittest
 
 from transformers import VitDetConfig
-from transformers.testing_utils import require_torch, torch_device
+from transformers.testing_utils import is_flaky, require_torch, torch_device
 from transformers.utils import is_torch_available
 
 from ...test_backbone_common import BackboneTesterMixin
@@ -91,6 +90,7 @@ def prepare_config_and_inputs(self):
     def get_config(self):
         return VitDetConfig(
             image_size=self.image_size,
+            pretrain_image_size=self.image_size,
             patch_size=self.patch_size,
             num_channels=self.num_channels,
             hidden_size=self.hidden_size,
@@ -175,6 +175,10 @@ def setUp(self):
         self.model_tester = VitDetModelTester(self)
         self.config_tester = ConfigTester(self, config_class=VitDetConfig, has_text_modality=False, hidden_size=37)
 
+    @is_flaky(max_attempts=3, description="`torch.nn.init.trunc_normal_` is flaky.")
+    def test_initialization(self):
+        super().test_initialization()
+
     # TODO: Fix me (once this model gets more usage)
     @unittest.skip("Does not work on the tiny model as we keep hitting edge cases.")
     def test_cpu_offload(self):
@@ -210,18 +214,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/vitmatte/test_modeling_vitmatte.py b/tests/models/vitmatte/test_modeling_vitmatte.py
index fcc99de0ba93d2..c9446b116f1e45 100644
--- a/tests/models/vitmatte/test_modeling_vitmatte.py
+++ b/tests/models/vitmatte/test_modeling_vitmatte.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch VitMatte model. """
 
 
-import inspect
 import unittest
 
 from huggingface_hub import hf_hub_download
@@ -189,18 +188,6 @@ def test_training_gradient_checkpointing_use_reentrant_false(self):
     def test_model_common_attributes(self):
         pass
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py
index e26527267fcb2f..9de3b8ff2c21b6 100644
--- a/tests/models/whisper/test_modeling_whisper.py
+++ b/tests/models/whisper/test_modeling_whisper.py
@@ -17,6 +17,7 @@
 import copy
 import inspect
 import os
+import random
 import tempfile
 import time
 import unittest
@@ -47,7 +48,7 @@
 
 if is_datasets_available():
     import datasets
-    from datasets import load_dataset
+    from datasets import Audio, load_dataset
 
 if is_torch_available():
     import torch
@@ -61,8 +62,81 @@
         WhisperProcessor,
         set_seed,
     )
+    from transformers.generation.logits_process import LogitsProcessor
     from transformers.models.whisper.modeling_whisper import WhisperDecoder, WhisperEncoder, sinusoids
 
+    class DummyTimestampLogitProcessor(LogitsProcessor):
+        """This processor fakes the correct timestamps tokens pattern [TOK_1] [TOK_2] ... [TOK_N] [TIME_STAMP_TOK_1] [TIME_STAMP_TOK_2] [TOK_N+1] ..."""
+
+        def __init__(
+            self, timestamp_begin, vocab_size, batch_size, max_length, min_space=3, seed=0, is_length_ascending=True
+        ):
+            self.timestamp_begin = timestamp_begin
+            self.vocab_size = vocab_size
+
+            self.min_space_between_timestamps = min_space
+            self.timestamp_tokens = torch.arange(self.timestamp_begin, self.vocab_size)
+            self.timestamp_tokens.to(torch_device)
+            self.is_length_ascending = is_length_ascending
+
+            self.no_time_stamp_counter = batch_size * [0]
+            self.prev_highest_timestamp = batch_size * [0]
+            self.batch_size = batch_size
+            self.max_length = max_length
+            self.count = 0
+
+            self.let_pass = [[] for _ in range(batch_size)]
+            for k in range(batch_size):
+                random.seed(seed + k)
+                for _ in range(10000):
+                    self.let_pass[k].append(random.randint(1, 10) <= 3)
+
+        def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+            # we don't want to randomely sample timestamp tokens
+            if input_ids.shape[-1] > 1:
+                scores[:, self.timestamp_begin :] = -float("inf")
+
+            self.no_time_stamp_counter = [x + 1 for x in self.no_time_stamp_counter]
+            for k in range(input_ids.shape[0]):
+                # make sure to use correct index if a batch was removed
+                if self.is_length_ascending and input_ids.shape[0] < self.batch_size:
+                    prev_k = k + self.batch_size - input_ids.shape[0]
+                else:
+                    prev_k = k
+
+                if input_ids[k, -1] == self.timestamp_begin:
+                    self.no_time_stamp_counter[prev_k] = 0
+
+                can_produce = self.no_time_stamp_counter[prev_k] > self.min_space_between_timestamps
+                must_produce = (
+                    input_ids[k][2:].le(self.timestamp_begin).all() and input_ids.shape[-1] == self.max_length - 1
+                )
+                # produce timestamp with 30%
+                if (can_produce and self.let_pass[prev_k][self.count]) or must_produce:
+                    self.no_time_stamp_counter[prev_k] = 0
+                    self.prev_highest_timestamp[prev_k] = max(input_ids[k].max() + 1, self.timestamp_tokens[0].item())
+
+                    # force a timestamp
+                    scores[k, :] = -float("inf")
+                    scores[k, self.prev_highest_timestamp[prev_k]] = 10.0
+
+                if (
+                    input_ids.shape[-1] > 3
+                    and input_ids[k, -1].item() in self.timestamp_tokens
+                    and input_ids[k, -2].item() not in self.timestamp_tokens
+                ):
+                    # force the same as before
+                    scores[k, :] = -float("inf")
+                    scores[k, input_ids[k, -1].item()] = 10.0
+
+            self.count += 1
+
+            if torch.isinf(scores).all():
+                raise ValueError("Dummy logit processor is incorrectly set up. Scores should not be all inf.")
+
+            return scores
+
+
 if is_flax_available():
     import jax.numpy as jnp
 
@@ -293,6 +367,7 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
             "audio-classification": WhisperForAudioClassification,
             "automatic-speech-recognition": WhisperForConditionalGeneration,
             "feature-extraction": WhisperModel,
+            "text-generation": WhisperForCausalLM,
         }
         if is_torch_available()
         else {}
@@ -816,12 +891,13 @@ def test_flash_attn_2_inference(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
+                    tmpdirname,
+                    torch_dtype=torch.bfloat16,
                 )
                 model.to(torch_device)
 
@@ -861,11 +937,11 @@ def test_flash_attn_2_inference_padding_right(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
-                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=False)
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16)
                 model.to(torch_device)
 
                 dummy_input = inputs_dict[model.main_input_name][:1]
@@ -906,6 +982,7 @@ def _create_and_check_torchscript(self, config, inputs_dict):
 
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
         configs_no_init.torchscript = True
+        configs_no_init._attn_implementation = "eager"
         for model_class in self.all_model_classes:
             model = model_class(config=configs_no_init)
             model.to(torch_device)
@@ -1237,6 +1314,133 @@ def test_generate_with_prompt_ids_max_length(self):
 
         model.generate(input_features, max_new_tokens=1, prompt_ids=prompt_ids)
 
+    def test_longform_generate_single_batch(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        model = WhisperForConditionalGeneration(config).eval().to(torch_device)
+        input_features = input_dict["input_features"]
+
+        # len = 250 with num_input_frames = 60
+        long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
+
+        # force bsz=1
+        long_input_features = long_input_features[:1]
+        vocab_size = model.config.vocab_size
+
+        batch_size = 1
+        num_timestamp_tokens = 20
+        max_length = 16
+        logits_processor = [
+            DummyTimestampLogitProcessor(
+                vocab_size - num_timestamp_tokens,
+                vocab_size,
+                batch_size=batch_size,
+                max_length=max_length,
+                min_space=4,
+            )
+        ]
+
+        # each chunk should not be longer than 10
+        model.generation_config.max_length = max_length
+
+        # if input features are long can't set return_timestamps to False
+        with self.assertRaises(ValueError):
+            _ = model.generate(long_input_features, logits_processor=logits_processor, return_timestamps=False)
+
+        # if input features are long need to set generation config
+        with self.assertRaises(ValueError):
+            _ = model.generate(long_input_features, logits_processor=logits_processor)
+
+        timestamp_begin = vocab_size - num_timestamp_tokens
+        model.generation_config.no_timestamps_token_id = timestamp_begin - 1
+        model.generation_config.eos_token_id = None
+        model.generation_config._detect_timestamp_from_logprob = False
+        # make sure that we only have the same begin token
+        model.generation_config.max_initial_timestamp_index = 0
+
+        outputs = model.generate(long_input_features, logits_processor=logits_processor, return_segments=True)
+
+        segments = outputs["segments"][0]
+
+        for i, segment in enumerate(segments):
+            assert segment["start"] <= segment["end"], "start has to be smaller equal end"
+            assert (
+                segment["tokens"][0] == model.generation_config.decoder_start_token_id
+                or segment["tokens"][0] >= timestamp_begin
+            ), "First segment token should be a timestamp token"
+            assert any(
+                s > timestamp_begin for s in segment["tokens"][1:]
+            ), f"At least one segment token should be a timestamp token, but not first., {segment['tokens']}"
+            assert (
+                segment["tokens"].shape[-1] <= max_length
+            ), "make sure that no segment is larger than max generation length"
+
+    def test_longform_generate_multi_batch(self):
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        model = WhisperForConditionalGeneration(config).eval().to(torch_device)
+        input_features = input_dict["input_features"].to(torch_device)
+
+        # len = 250 with num_input_frames = 60
+        long_input_features = torch.cat([input_features.repeat(1, 1, 4), input_features[:, :, :10]], dim=-1)
+        long_input_features[:1, :, :200]
+        input_features_2 = long_input_features[1:]
+        attention_mask = torch.ones(
+            (2, long_input_features.shape[-1]), dtype=input_features.dtype, device=input_features.device
+        )
+        attention_mask[0, 200:] = 0
+
+        # force bsz=1
+        vocab_size = model.config.vocab_size
+
+        batch_size = 1
+        num_timestamp_tokens = 20
+        max_length = 16
+        timestamp_begin = vocab_size - num_timestamp_tokens
+        model.generation_config.no_timestamps_token_id = timestamp_begin - 1
+        model.generation_config.eos_token_id = None
+        model.generation_config._detect_timestamp_from_logprob = False
+        # make sure that we only have the same begin token
+        model.generation_config.max_initial_timestamp_index = 0
+
+        logits_processor = [
+            DummyTimestampLogitProcessor(
+                vocab_size - num_timestamp_tokens,
+                vocab_size,
+                batch_size=batch_size,
+                max_length=max_length,
+                min_space=4,
+                seed=1,
+            )
+        ]
+        outputs_2 = model.generate(input_features_2, logits_processor=logits_processor, return_segments=True)
+        tokens_2 = outputs_2["sequences"][0]
+        segments_2 = outputs_2["segments"][0]
+
+        batch_size = 2
+        logits_processor = [
+            DummyTimestampLogitProcessor(
+                vocab_size - num_timestamp_tokens,
+                vocab_size,
+                batch_size=batch_size,
+                max_length=max_length,
+                min_space=4,
+                seed=0,
+            )
+        ]
+        outputs = model.generate(
+            long_input_features, attention_mask=attention_mask, logits_processor=logits_processor, return_segments=True
+        )
+        tokens = outputs["sequences"][1]
+        segments = outputs["segments"][1]
+
+        assert tokens_2.tolist() == tokens.tolist()
+
+        for seg1, seg2 in zip(segments_2, segments):
+            assert seg1["start"] == seg2["start"]
+            assert seg1["end"] == seg2["end"]
+            assert seg1["tokens"].tolist() == seg2["tokens"].tolist()
+
 
 @require_torch
 @require_torchaudio
@@ -1831,6 +2035,125 @@ def test_speculative_decoding_non_distil(self):
         ]
         assert total_time_non_assist > total_time_assist, "Make sure that assistant decoding is faster"
 
+    @slow
+    def test_whisper_longform_single_batch(self):
+        # fmt: off
+        EXPECTED_TEXT = [' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter\'s manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton\'s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell\'s pictures are a sort of up-gards and atom paintings, and Mason\'s exquisite idles are as national as a jingo poem. Mr. Birk at Foster\'s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampoo or a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes the customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mantelboard. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon\'s body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered his muscles into complete relaxation. Oli\'s heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I\'m here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you\'re being a fool. out, through his silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. had died before during the 20s and death during the last round was in some ways easier than defeat. Breathing deeply, Breon\'s softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent\'s face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that\'s rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone, and gone for good," answered Polychrom, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with says he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn\'t work too hard, said Shaggy. He doesn\'t work at all. In fact, there\'s nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we\'ve turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The middle forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I\'m quite sure he didn\'t. That\'s funny, remarked Betsy thoughtfully. I don\'t believe Anne knew any magic, or she\'d have worked it before. I do not know, confess Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Virgato used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgato\'s discarded ruby crown and holding in his hand to scepter which reggative head so often thrown at his head.']
+        # fmt: on
+
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model = model.to("cuda")
+
+        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
+        one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
+
+        input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[
+            "input_features"
+        ]
+        input_features = input_features.to(device="cuda")
+
+        result = model.generate(input_features, return_timestamps=True)
+        decoded = processor.batch_decode(result, skip_special_tokens=True)
+
+        assert decoded == EXPECTED_TEXT
+
+    @slow
+    def test_whisper_longform_multi_batch(self):
+        # fmt: off
+        EXPECTED_TEXT_1 = [" Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birkett Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing a poster or near the fire, and the ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. a Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the tight-wing cloth that was the only germany war. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oily his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, knights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenty's he must have drawn his gun, because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. Ten seconds he asked the handler who was needing his aching muscles. a red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were andextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. I rolled the mazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue, pre-inscented and new to fifth point was his. Then the powerful twist that's rest of the side, in and under the guard, because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, a cooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, return Calico. Where is my brother now? choir-dshaggy, in the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh, no, I'm quite sure he didn't. That's funny, remarked Betsy thoughtfully. I don't believe and knew any magic, or she'd have worked it before. I do not know, confess shaggy. True, a great calico. Calico went to the big gong and pounded on it, just as Virgado used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgados discarded Ruby Crown, and holding in his hand to scepter, which Virgado had so often thrown at his head. head."]
+        EXPECTED_TEXT_2 = [" Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton's work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-gards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Burkett Foster's landscapes smile at one much in the same way that Mr. Carker."]
+        EXPECTED_TEXT_3 = [" possible. Nor is Mr. Quilter's manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grieved doubts whether Sir Frederick Layton's work is really greek after all, and can discover in it but little of rocky Ithaca. Linnell's pictures are a sort of up-guards and atom paintings, and Mason's exquisite idles are as national as a jingo poem. Mr. Birk at Foster's landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath, next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. Under general principles of art, Mr. Quilter writes with equal lucidity. Painting, he tells us, is of a different quality to mathematics and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Mix a customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire. any ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man, and remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon's body trickling into the titling cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes. Even to soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered as muscles into complete relaxation. Oily his heart and lungs worked on at a strong measured rate. He was in In reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, the thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency clearly used to command. I'm here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenty's he must have drawn his gun, because the intruder said quickly, but that away you're being a fool. Out there was silence then, and still wondering, Breon was once more asleep. Ten seconds he asked the handler who was needing his aching muscles. a red-haired mountain of a man with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing, just thrust and parry and victory to the stronger. Every man who entered the twenties had his own training tricks. There appeared to be an immediate association with the death trauma as if the two were andextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the twenties and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon's softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent's face when the man finally recognized his error. A wave of despair rolled out from our rogue, re-insunced it and knew the fifth point was his. Then the powerful twist that's rest of the side, in and under the guard, because you were sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, a cooing dove. He has gone and gone for good, answered Polychrome, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn't work too hard, since Shaggy. He doesn't work at all. In fact, there's nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we've turned Calico. Where is my brother now? quared shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I'm quite sure he didn't. And that's funny, remarked Betsy thoughtfully. I don't believe Anne knew any magic, or she'd have worked it before. I do not know, confess Shaggy. True, a great calico. Calico went to the big gong and pounded on it, just as we're good to have used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the thrown wearing ruggedos discarded ruby crown and holding in his hand to septor which Ruggato had so often thrown at his head."]
+        EXPECTED_TEXT_4 = [' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel. Nor is Mr. Quilter\'s manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and roast beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton\'s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell\'s pictures are a sort of up-gards and atom paintings, and Mason\'s exquisite idles are as national as a jingo poem. Mr. Birk at Foster\'s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampoo or a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate an expression. On the general principles of art, Mr. Quilter writes with equal lucidity. he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. Makes the customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer. Near the fire, any ornaments Fred brought home from India on the mantelboard. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. And remarks was pleasing courtesy in Felicitis Grace that many faces are feeling. Only, unfortunately, his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the Tupper of painting. By Harry Quilter M.A. A man said to the universe, Sir, I exist. Sweat-covered Breon\'s body trickling into the tight-lowing cloth that was the only german he wore. The cut on his chest still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, retrovealities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were triggered his muscles into complete relaxation. Oli\'s heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the twenties needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied, The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I\'m here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The twenties, he must have drawn his gun because the intruder said quickly, but that away you\'re being a fool. out, through his silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. man who entered the twenties had his own training tricks. They were appeared to be an immediate association with the death trauma, as if the two were inextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported except at two points, the head and heels. This is physically impossible when conscious. had died before during the 20s and death during the last round was in some ways easier than defeat. Breathing deeply, Breon\'s softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent\'s face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that\'s rested aside, in and under the guard, because he was sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accooing dove. He has gone, and gone for good," answered Polychrom, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with says he stepped forward and burst the stout chains as easily as if they had been threads. The little girl had been asleep, but she heard the wraps and opened the door. The king has flooded disgrace, and your friends are asking for you. I begged Ruggadot long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn\'t work too hard, said Shaggy. He doesn\'t work at all. In fact, there\'s nothing he can do in these dominions as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we\'ve turned Calico. Where is my brother now, inquired Shaggy. In the metal forest. Where is that? The middle forest is in the great domed cavern, the largest and all-ard dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I\'m quite sure he didn\'t. That\'s funny, remarked Betsy thoughtfully. I don\'t believe Anne knew any magic, or she\'d have worked it before. I do not know, confess Shaggy. True, agreed Calico. Calico went to the big gong and pounded on it just as Virgato used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Virgato\'s discarded ruby crown and holding in his hand to scepter which reggative head so often thrown at his head.']
+        # fmt: on
+
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model = model.to("cuda")
+
+        ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean")
+        one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32)
+        audios = []
+        audios.append(one_audio[110000:])
+        audios.append(one_audio[:800000])
+        audios.append(one_audio[80000:])
+        audios.append(one_audio[:])
+
+        decoded_single = []
+        for audio in audios:
+            inputs = processor(audio, return_tensors="pt", truncation=False)
+            inputs = inputs.to(device="cuda")
+
+            result = model.generate(**inputs, return_timestamps=True)
+            decoded_single.append(processor.batch_decode(result, skip_special_tokens=True))
+
+        inputs = processor(
+            audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True
+        )
+        inputs = inputs.to(device="cuda")
+
+        result = model.generate(**inputs, return_timestamps=True)
+        decoded_all = processor.batch_decode(result, skip_special_tokens=True)
+
+        # make sure single & batch is exactly the same
+        assert decoded_all[0:1] == decoded_single[0]
+        assert decoded_all[1:2] == decoded_single[1]
+        assert decoded_all[2:3] == decoded_single[2]
+        assert decoded_all[3:4] == decoded_single[3]
+
+        # exact match
+        assert decoded_all[0:1] == EXPECTED_TEXT_1
+        assert decoded_all[1:2] == EXPECTED_TEXT_2
+        assert decoded_all[2:3] == EXPECTED_TEXT_3
+        assert decoded_all[3:4] == EXPECTED_TEXT_4
+
+    @slow
+    def test_whisper_longform_multi_batch_hard(self):
+        # fmt: off
+        EXPECTED_TEXT = [
+            " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile!",
+            " Folks, I spend a lot of time right over there, night after night after night, actually. Carefully selecting for you the day's noosiest, most aerodynamic headlines, stress testing, and those topical anti-lock breaks and power steering, painstakingly stitching, leather seating so soft, it would make JD power and her associates blush to create the luxury sedan that is my nightly monologue. But sometimes, you sometimes, folks. I lurched a consciousness in the back of an abandoned school and slap myself awake with a crusty floor mat. Before using a mouse-bitten timing belt to strap some old plywood to a couple of discarded oil drums, then by the light of a heathen moon, render a gas tank out of an empty big gulp, fill with white claw and denatured alcohol, then light a match and let her rip and the demented one man soapbox derby of news that is my segment. Me, Guadalupe! No!",
+            " Ladies and gentlemen, you know, I spent a lot of time right over there Raising the finest Holstein news cattle firmly yet tenderly milking the latest headlines from their jokes swollen teats Churning the daily stories into the decadent proven-style style triple cream breed that is my nightly monologue But sometimes sometimes folks I stagger home hungry after being released by the police and Root around in the neighbor's trash can for an old milk carton scrape out the blooming dairy residue into the remains of a wet cheese rod I won from a rat in a pre-donned street fight. Put it in a discarded paint can to leave it to ferment next to a trash fire then hunker down and hallucinate while eating the listeria laden demon custard of news that is my segment. You mean one of them.",
+            " Folks, if you watch this show, you know I spend most of my time right over there carefully sorting through the day's biggest stories and selecting only the most subtle and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the Ichol Gregoire Ferrandi, who carefully dye them in a palette of bright zesty shades and adorn them in the finest and most topical inlay work using hand tools and double magnifying glasses, then assemble them according to now classic and elegant geometry using our signature saddles stitching. In line it with bees, wax, coated linen, finely attached a mallet, hammered strap, pearled hardware, and close-shit to create for you the one-of-a-kind hoke couture, Erme's Birkin bag that is my monologue. But sometimes, sometimes folks, sometimes. Sometimes I wake up in the last car of an abandoned roller coaster at Coney Island where I'm I'm hiding from the triads. I have some engine lubricants out of a safe way bag and stagger down the shore to tear the sail off a beach schooner. Then I rip the coaxial cable out of an RV and elderly couple from Utah, Hank, and Mabel lovely folks. And use it to stitch the sail into a loose pouch like a rock sack. And I stow away in the back of a garbage truck to the junkyard where I pick through to the debris for only the broken toys that make me the saddest until I have loaded for you. The Hobo Fugitives bug out, bindle of news that is my segment. Me one!",
+            " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's biggest stories right over there. Meticulously selecting the most topical chakra affirming scented candles, and using Feng Shui to perfectly align the joke energy in the exclusive boutique yoga retreat that is my monologue. But sometimes just sometimes I go to the dumpster behind the waffle house at three in the morning, take off my shirt, cover myself, and used fry oil, wrap my hands with some double-duct tape by stole from the broken car window. Pound a six-pack of blueberry hard-seltzer and a sack of pills I stole from a parked ambulance. Then arm wrestle a raccoon in the back alley vision quest of news that is my segment. Meanwhile!",
+            " You know, folks, I spend most of my time right over there. Mining the day's biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels. Then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest and glory. Then, using the Germanic tradition press-black process, I place thin sheets of foil against the scenes and by hammering or otherwise applying pressure from the back, I project these scenes into a pair of cheat cards in a faceplate and, finally, using fluted strips of white alloyed molding, I divide the designs into framed panels and hold it all together using bronze rivets to create the beautiful and intimidating, Anglo-Saxon battle helm that is my nightly monologue. Sometimes, sometimes folks. Sometimes, just sometimes, I come into my sense as fully naked on the deck of a pirate besieged melee container ship that picked me up floating on the detached door of a portapotty in the Indian Ocean. Then after a sunstroke-induced realization of the crew of this ship plans to sell me an exchange for a bag of oranges to fight off scurvy, I lead a mutiny using only a PVC pipe at a pool chain that accepting my new role as Captain and declaring myself king of the windarc seas. I grab a dirty mop bucket covered in barnacles and adorn it with the teeth of the vanquished to create the sopping wet pirate crown of news that is my segment. Meanwhile!",
+            " Folks, if you watch this show, you know I spend most of my time right over there carefully blending for you the day's Newsiest most topical flower eggs milk and butter and Stranding into a fine batter to make delicate and informative comedy pancakes Then I glaze them in the juice and zest of the most relevant midnight Valencia oranges and douse it all and a fine Dela main de voyage cognac Before prom baying and basting them tables. I deserve for you the James Beard award worthy crepe suzzette That is my nightly monologue, but sometimes just sometimes folks. I wake up in the baggage hold of Greyhound bus. It's being hoisted by the scrap yard claw toward the burn pit. Escape to a nearby abandoned price chopper where I scrounge for old bread scraps and busted open bags of starfruit candies and expired eggs. Chuck it all on a dirty hubcap and slap it over a tire fire before using the legs of a strain, pair of sweatpants and as oven mitts to extract and serve the demented transience poundcake of news that is my segment. Me, Guadalupe!",
+            " Folks, if you watched the show and I hope you do, I spent a lot of time right over there. Tiredlessly studying the lineage of the days most important thoroughbred stories and whole-stiner headlines, working with the best trainers, money can buy to rear their comedy offspring with a hand that is stern yet gentle into the triple crown winning equine specimen. That is my nightly monologue, but sometimes, sometimes, folks, I break into an unincorporated veterinary genetics lab and grab whatever test tubes I can find and then under a grow light I got from a discarded chia pet. I mixed the pilfered DNA of a horse and whatever was in a tube labeled Keith Colan extra. Slurrying the concoction with caffeine pills and a microwave red bull, I screamed, sang a prayer to Janice, initiator of human life and God of transformation as a half horse, half man, freak. Seizes to life before me and the hideous collection of loose animal parts and corrupted man tissue that is my segment. Meanwhile!",
+        ]
+        # fmt: on
+
+        processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model = model.to("cuda")
+
+        ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
+        ds = ds.cast_column("audio", Audio(sampling_rate=16000))
+
+        num_samples = 8
+
+        audio = ds[:num_samples]["audio"]
+        audios = [x["array"] for x in audio]
+
+        decoded_single = []
+        for audio in audios:
+            inputs = processor(audio, return_tensors="pt", truncation=False, sampling_rate=16_000)
+            inputs = inputs.to(device="cuda")
+
+            result = model.generate(**inputs, return_timestamps=True)
+            decoded_single += processor.batch_decode(result, skip_special_tokens=True)
+
+        inputs = processor(
+            audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True
+        )
+        inputs = inputs.to(device="cuda")
+
+        result = model.generate(**inputs, return_timestamps=True)
+        decoded_all = processor.batch_decode(result, skip_special_tokens=True)
+
+        for i in range(num_samples):
+            assert decoded_all[i] == decoded_single[i]
+            assert decoded_all[i] == EXPECTED_TEXT[i]
+
 
 def prepare_whisper_encoder_inputs_dict(config, input_features, head_mask=None):
     if head_mask is None:
@@ -2016,13 +2339,20 @@ def test_encoder_outputs(self):
             with torch.no_grad():
                 outputs = model(**inputs)[0]
 
-            input_ids = inputs["input_features"]
+            encoder = model.encoder
+
+            encoder_inputs = {"input_features": inputs["input_features"]}
             del inputs["input_features"]
 
-            encoder = model.encoder
+            if "head_mask" in inputs:
+                encoder_inputs["head_mask"] = inputs["head_mask"]
+            if "attention_mask" in inputs:
+                encoder_inputs["attention_mask"] = inputs["attention_mask"]
+            if "output_attentions" in inputs:
+                encoder_inputs["output_attentions"] = inputs["output_attentions"]
 
             with torch.no_grad():
-                inputs["encoder_outputs"] = encoder(input_ids)
+                inputs["encoder_outputs"] = encoder(**encoder_inputs)
                 outputs_embeds = model(**inputs)[0]
 
             self.assertTrue((outputs_embeds == outputs).all())
diff --git a/tests/models/xglm/test_modeling_xglm.py b/tests/models/xglm/test_modeling_xglm.py
index 457317f0783681..e482b1b384f3ee 100644
--- a/tests/models/xglm/test_modeling_xglm.py
+++ b/tests/models/xglm/test_modeling_xglm.py
@@ -357,10 +357,6 @@ def test_model_from_pretrained(self):
     def test_model_parallelism(self):
         super().test_model_parallelism()
 
-    @unittest.skip("This test is currently broken because of safetensors.")
-    def test_tf_from_pt_safetensors(self):
-        pass
-
 
 @require_torch
 class XGLMModelLanguageGenerationTest(unittest.TestCase):
diff --git a/tests/models/yolos/test_image_processing_yolos.py b/tests/models/yolos/test_image_processing_yolos.py
index 003a00611059cf..1039e4c91bc6e1 100644
--- a/tests/models/yolos/test_image_processing_yolos.py
+++ b/tests/models/yolos/test_image_processing_yolos.py
@@ -21,7 +21,7 @@
 from transformers.testing_utils import require_torch, require_vision, slow
 from transformers.utils import is_torch_available, is_vision_available
 
-from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
+from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs
 
 
 if is_torch_available():
@@ -127,7 +127,7 @@ def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=F
 
 @require_torch
 @require_vision
-class YolosImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
+class YolosImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase):
     image_processing_class = YolosImageProcessor if is_vision_available() else None
 
     def setUp(self):
diff --git a/tests/models/yolos/test_modeling_yolos.py b/tests/models/yolos/test_modeling_yolos.py
index c1fb50e30b766b..73281df02560d0 100644
--- a/tests/models/yolos/test_modeling_yolos.py
+++ b/tests/models/yolos/test_modeling_yolos.py
@@ -15,7 +15,6 @@
 """ Testing suite for the PyTorch YOLOS model. """
 
 
-import inspect
 import unittest
 
 from transformers import YolosConfig
@@ -217,18 +216,6 @@ def test_model_common_attributes(self):
             x = model.get_output_embeddings()
             self.assertTrue(x is None or isinstance(x, nn.Linear))
 
-    def test_forward_signature(self):
-        config, _ = self.model_tester.prepare_config_and_inputs_for_common()
-
-        for model_class in self.all_model_classes:
-            model = model_class(config)
-            signature = inspect.signature(model.forward)
-            # signature.parameters is an OrderedDict => so arg_names order is deterministic
-            arg_names = [*signature.parameters.keys()]
-
-            expected_arg_names = ["pixel_values"]
-            self.assertListEqual(arg_names[:1], expected_arg_names)
-
     def test_model(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_model(*config_and_inputs)
diff --git a/tests/pipelines/test_pipelines_automatic_speech_recognition.py b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
index b5dee1e00fc9e1..2ccaf71255670e 100644
--- a/tests/pipelines/test_pipelines_automatic_speech_recognition.py
+++ b/tests/pipelines/test_pipelines_automatic_speech_recognition.py
@@ -16,7 +16,7 @@
 
 import numpy as np
 import pytest
-from datasets import load_dataset
+from datasets import Audio, load_dataset
 from huggingface_hub import hf_hub_download, snapshot_download
 
 from transformers import (
@@ -329,16 +329,16 @@ def test_return_timestamps_in_preprocess(self):
         self.assertEqual(
             res,
             {
-                "text": " Conquered returned to its place amidst the tents.",
-                "chunks": [
-                    {"text": " Conquered", "timestamp": (0.5, 1.2)},
-                    {"text": " returned", "timestamp": (1.2, 1.64)},
-                    {"text": " to", "timestamp": (1.64, 1.84)},
-                    {"text": " its", "timestamp": (1.84, 2.02)},
-                    {"text": " place", "timestamp": (2.02, 2.28)},
-                    {"text": " amidst", "timestamp": (2.28, 2.78)},
-                    {"text": " the", "timestamp": (2.78, 2.96)},
-                    {"text": " tents.", "timestamp": (2.96, 3.48)},
+                'text': ' Conquered returned to its place amidst the tents.',
+                'chunks': [
+                    {'text': ' Conquered', 'timestamp': (0.5, 1.2)},
+                    {'text': ' returned', 'timestamp': (1.2, 1.64)},
+                    {'text': ' to', 'timestamp': (1.64, 1.84)},
+                    {'text': ' its', 'timestamp': (1.84, 2.02)},
+                    {'text': ' place', 'timestamp': (2.02, 2.28)},
+                    {'text': ' amidst', 'timestamp': (2.28, 2.8)},
+                    {'text': ' the', 'timestamp': (2.8, 2.98)},
+                    {'text': ' tents.', 'timestamp': (2.98, 3.48)},
                 ],
             },
         )
@@ -776,27 +776,27 @@ def test_simple_whisper_asr(self):
         self.assertEqual(
             output,
             {
-                "text": " Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.",
-                "chunks": [
-                    {'text': ' Mr.', 'timestamp': (0.0, 1.02)},
-                    {'text': ' Quilter', 'timestamp': (1.02, 1.18)},
+                'text': ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',
+                'chunks': [
+                    {'text': ' Mr.', 'timestamp': (0.38, 1.04)},
+                    {'text': ' Quilter', 'timestamp': (1.04, 1.18)},
                     {'text': ' is', 'timestamp': (1.18, 1.44)},
                     {'text': ' the', 'timestamp': (1.44, 1.58)},
                     {'text': ' apostle', 'timestamp': (1.58, 1.98)},
-                    {'text': ' of', 'timestamp': (1.98, 2.3)},
-                    {'text': ' the', 'timestamp': (2.3, 2.46)},
+                    {'text': ' of', 'timestamp': (1.98, 2.32)},
+                    {'text': ' the', 'timestamp': (2.32, 2.46)},
                     {'text': ' middle', 'timestamp': (2.46, 2.56)},
-                    {'text': ' classes,', 'timestamp': (2.56, 3.38)},
-                    {'text': ' and', 'timestamp': (3.38, 3.52)},
-                    {'text': ' we', 'timestamp': (3.52, 3.6)},
-                    {'text': ' are', 'timestamp': (3.6, 3.72)},
+                    {'text': ' classes,', 'timestamp': (2.56, 3.4)},
+                    {'text': ' and', 'timestamp': (3.4, 3.54)},
+                    {'text': ' we', 'timestamp': (3.54, 3.62)},
+                    {'text': ' are', 'timestamp': (3.62, 3.72)},
                     {'text': ' glad', 'timestamp': (3.72, 4.0)},
                     {'text': ' to', 'timestamp': (4.0, 4.26)},
-                    {'text': ' welcome', 'timestamp': (4.26, 4.54)},
-                    {'text': ' his', 'timestamp': (4.54, 4.92)},
-                    {'text': ' gospel.', 'timestamp': (4.92, 6.66)},
-                ],
-            },
+                    {'text': ' welcome', 'timestamp': (4.26, 4.56)},
+                    {'text': ' his', 'timestamp': (4.56, 4.92)},
+                    {'text': ' gospel.', 'timestamp': (4.92, 5.84)}
+                ]
+            }
         )
         # fmt: on
 
@@ -1087,6 +1087,51 @@ def test_with_local_lm_fast(self):
         self.assertEqual(output, [{"text": ANY(str)}])
         self.assertEqual(output[0]["text"][:6], "<s> <s")
 
+    @require_torch
+    @slow
+    def test_whisper_longform(self):
+        # fmt: off
+        EXPECTED_RESULT = """ Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out of fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct denny's, set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile!"""
+        # fmt: on
+
+        processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
+        model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
+        model = model.to("cuda")
+
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model=model,
+            tokenizer=processor.tokenizer,
+            feature_extractor=processor.feature_extractor,
+            max_new_tokens=128,
+            device="cuda:0",
+        )
+
+        ds = load_dataset("distil-whisper/meanwhile", "default")["test"]
+        ds = ds.cast_column("audio", Audio(sampling_rate=16000))
+        audio = ds[:1]["audio"]
+
+        result = pipe(audio)[0]["text"]
+
+        assert result == EXPECTED_RESULT
+
+    @require_torch
+    @slow
+    def test_seamless_v2(self):
+        pipe = pipeline(
+            "automatic-speech-recognition",
+            model="facebook/seamless-m4t-v2-large",
+            device="cuda:0",
+        )
+
+        dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        sample = dataset[0]["audio"]
+
+        result = pipe(sample, generate_kwargs={"tgt_lang": "eng"})
+        EXPECTED_RESULT = "mister quilter is the apostle of the middle classes and we are glad to welcome his gospel"
+
+        assert result["text"] == EXPECTED_RESULT
+
     @require_torch
     @slow
     def test_chunking_and_timestamps(self):
@@ -1355,7 +1400,7 @@ def test_slow_unfinished_sequence(self):
             out,
             {
                 "chunks": [
-                    {"text": "", "timestamp": (18.94, 0.0)},
+                    {"text": "", "timestamp": (18.94, 0.02)},
                     {"text": "मिर्ची में कितने विभिन्न प्रजातियां हैं", "timestamp": (None, None)},
                 ],
                 "text": "मिर्ची में कितने विभिन्न प्रजातियां हैं",
diff --git a/tests/pipelines/test_pipelines_fill_mask.py b/tests/pipelines/test_pipelines_fill_mask.py
index c85797fbb6eb4a..571b320d617fa1 100644
--- a/tests/pipelines/test_pipelines_fill_mask.py
+++ b/tests/pipelines/test_pipelines_fill_mask.py
@@ -216,15 +216,24 @@ def run_large_test(self, unmasker):
             ],
         )
 
+        dummy_str = "Lorem ipsum dolor sit amet, consectetur adipiscing elit," * 100
         outputs = unmasker(
-            "My name is <mask>" + "Lorem ipsum dolor sit amet, consectetur adipiscing elit," * 100,
+            "My name is <mask>" + dummy_str,
             tokenizer_kwargs={"truncation": True},
         )
+        simplified = nested_simplify(outputs, decimals=4)
         self.assertEqual(
-            nested_simplify(outputs, decimals=6),
+            [{"sequence": x["sequence"][:100]} for x in simplified],
             [
-                {"sequence": "My name is grouped", "score": 2.2e-05, "token": 38015, "token_str": " grouped"},
-                {"sequence": "My name is accuser", "score": 2.1e-05, "token": 25506, "token_str": " accuser"},
+                {"sequence": f"My name is,{dummy_str}"[:100]},
+                {"sequence": f"My name is:,{dummy_str}"[:100]},
+            ],
+        )
+        self.assertEqual(
+            [{k: x[k] for k in x if k != "sequence"} for x in simplified],
+            [
+                {"score": 0.2819, "token": 6, "token_str": ","},
+                {"score": 0.0954, "token": 46686, "token_str": ":,"},
             ],
         )
 
diff --git a/tests/pipelines/test_pipelines_image_to_text.py b/tests/pipelines/test_pipelines_image_to_text.py
index 7514f17919b1f0..4f5b5780277bfc 100644
--- a/tests/pipelines/test_pipelines_image_to_text.py
+++ b/tests/pipelines/test_pipelines_image_to_text.py
@@ -252,3 +252,24 @@ def test_large_model_tf(self):
                 [{"generated_text": "a cat laying on a blanket next to a cat laying on a bed "}],
             ],
         )
+
+    @slow
+    @require_torch
+    def test_conditional_generation_llava(self):
+        pipe = pipeline("image-to-text", model="llava-hf/bakLlava-v1-hf")
+        url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg"
+        image = Image.open(requests.get(url, stream=True).raw)
+
+        prompt = (
+            "<image>\nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT:"
+        )
+
+        outputs = pipe(image, prompt=prompt, generate_kwargs={"max_new_tokens": 200})
+        self.assertEqual(
+            outputs,
+            [
+                {
+                    "generated_text": "<image> \nUSER: What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud?\nASSISTANT: Lava"
+                }
+            ],
+        )
diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py
index b9a5febb560919..dc77204f3e13d1 100644
--- a/tests/pipelines/test_pipelines_text_generation.py
+++ b/tests/pipelines/test_pipelines_text_generation.py
@@ -242,7 +242,12 @@ def run_pipeline_test(self, text_generator, _):
         # We don't care about infinite range models.
         # They already work.
         # Skip this test for XGLM, since it uses sinusoidal positional embeddings which are resized on-the-fly.
-        EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = ["RwkvForCausalLM", "XGLMForCausalLM", "GPTNeoXForCausalLM"]
+        EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS = [
+            "RwkvForCausalLM",
+            "XGLMForCausalLM",
+            "GPTNeoXForCausalLM",
+            "FuyuForCausalLM",
+        ]
         if (
             tokenizer.model_max_length < 10000
             and text_generator.model.__class__.__name__ not in EXTRA_MODELS_CAN_HANDLE_LONG_INPUTS
diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py
index f0854e42553e16..8f9cbd91aad773 100644
--- a/tests/quantization/autoawq/test_awq.py
+++ b/tests/quantization/autoawq/test_awq.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import gc
 import tempfile
 import unittest
 
@@ -107,6 +108,11 @@ def setUpClass(cls):
             device_map=cls.device_map,
         )
 
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+        gc.collect()
+
     def test_quantized_model_conversion(self):
         """
         Simple test that checks if the quantized model has been converted properly
@@ -158,6 +164,13 @@ def test_quantized_model(self):
         output = self.quantized_model.generate(**input_ids, max_new_tokens=40)
         self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
 
+    def test_raise_if_non_quantized(self):
+        model_id = "facebook/opt-125m"
+        quantization_config = AwqConfig(bits=4)
+
+        with self.assertRaises(ValueError):
+            _ = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config)
+
     def test_quantized_model_bf16(self):
         """
         Simple test that checks if the quantized model is working properly with bf16
@@ -195,22 +208,6 @@ def test_save_pretrained(self):
             output = model.generate(**input_ids, max_new_tokens=40)
             self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
 
-    def test_raise_quantization(self):
-        """
-        Simple test that checks if one passes a quantization config to quantize a model, it raises an error
-        """
-        quantization_config = AwqConfig(bits=4)
-
-        with self.assertRaises(ValueError) as context:
-            _ = AutoModelForCausalLM.from_pretrained(
-                self.dummy_transformers_model_name, quantization_config=quantization_config
-            )
-
-        self.assertEqual(
-            str(context.exception),
-            "You cannot pass an `AwqConfig` when loading a model as you can only use AWQ models for inference. To quantize transformers models with AWQ algorithm, please refer to our quantization docs: https://huggingface.co/docs/transformers/main_classes/quantization ",
-        )
-
     @require_torch_multi_gpu
     def test_quantized_model_multi_gpu(self):
         """
@@ -225,3 +222,144 @@ def test_quantized_model_multi_gpu(self):
         output = quantized_model.generate(**input_ids, max_new_tokens=40)
 
         self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT)
+
+
+@slow
+@require_torch_gpu
+@require_auto_awq
+@require_accelerate
+class AwqFusedTest(unittest.TestCase):
+    model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ"
+    model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510"
+
+    custom_mapping_model_id = "TheBloke/Yi-34B-AWQ"
+    custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589"
+
+    prompt = (
+        "You're standing on the surface of the Earth. "
+        "You walk one mile south, one mile west and one mile north. "
+        "You end up exactly where you started. Where are you?"
+    )
+
+    EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for"
+    EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org"
+
+    def tearDown(self):
+        gc.collect()
+        torch.cuda.empty_cache()
+        gc.collect()
+
+    def _check_fused_modules(self, model):
+        has_fused_modules = False
+        fused_modules_name = ["QuantAttentionFused", "QuantFusedMLP", "FasterTransformerRMSNorm"]
+
+        for _, module in model.named_modules():
+            if module.__class__.__name__ in fused_modules_name:
+                has_fused_modules = True
+                break
+
+        self.assertTrue(has_fused_modules, "Modules fusing not performed correctly!")
+
+    def test_raise_save_pretrained(self):
+        """
+        Test that `save_pretrained` is effectively blocked for fused models
+        """
+        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            quantization_config=quantization_config,
+            low_cpu_mem_usage=True,
+            revision=self.model_revision,
+        ).to(torch_device)
+
+        self._check_fused_modules(model)
+
+        with self.assertRaises(ValueError), tempfile.TemporaryDirectory() as tmpdirname:
+            model.save_pretrained(tmpdirname)
+
+    def test_generation_fused(self):
+        """
+        Test generation quality for fused models - single batch case
+        """
+        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            quantization_config=quantization_config,
+            low_cpu_mem_usage=True,
+            revision=self.model_revision,
+        ).to(torch_device)
+
+        self._check_fused_modules(model)
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
+
+        inputs = tokenizer(self.prompt, return_tensors="pt").to(torch_device)
+
+        outputs = model.generate(**inputs, max_new_tokens=12)
+
+        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
+
+    def test_generation_fused_batched(self):
+        """
+        Test generation quality for fused models - multi batch case
+        """
+        quantization_config = AwqConfig(bits=4, fuse_max_seq_len=128, do_fuse=True)
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.model_name,
+            quantization_config=quantization_config,
+            low_cpu_mem_usage=True,
+            revision=self.model_revision,
+        ).to(torch_device)
+
+        self._check_fused_modules(model)
+
+        tokenizer = AutoTokenizer.from_pretrained(self.model_name, revision=self.model_revision)
+
+        tokenizer.pad_token_id = tokenizer.eos_token_id
+        inputs = tokenizer([self.prompt, self.prompt], return_tensors="pt", padding=True).to(torch_device)
+
+        outputs = model.generate(**inputs, max_new_tokens=12)
+
+        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION)
+
+    @require_torch_multi_gpu
+    def test_generation_custom_model(self):
+        """
+        Test generation quality for fused models using custom fused map.
+        """
+        quantization_config = AwqConfig(
+            bits=4,
+            fuse_max_seq_len=512,
+            modules_to_fuse={
+                "attention": ["q_proj", "k_proj", "v_proj", "o_proj"],
+                "layernorm": ["ln1", "ln2", "norm"],
+                "mlp": ["gate_proj", "up_proj", "down_proj"],
+                "use_alibi": False,
+                "num_attention_heads": 56,
+                "num_key_value_heads": 8,
+                "hidden_size": 7168,
+            },
+        )
+
+        model = AutoModelForCausalLM.from_pretrained(
+            self.custom_mapping_model_id,
+            quantization_config=quantization_config,
+            trust_remote_code=True,
+            device_map="balanced",
+            revision=self.custom_model_revision,
+        )
+
+        self._check_fused_modules(model)
+
+        tokenizer = AutoTokenizer.from_pretrained(
+            self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True
+        )
+
+        prompt = "Hello"
+        inputs = tokenizer(prompt, return_tensors="pt").to(torch_device)
+
+        outputs = model.generate(**inputs, max_new_tokens=12)
+        self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL)
diff --git a/tests/repo_utils/test_check_copies.py b/tests/repo_utils/test_check_copies.py
index e3e8e47a873f72..6afed02895a9b3 100644
--- a/tests/repo_utils/test_check_copies.py
+++ b/tests/repo_utils/test_check_copies.py
@@ -95,13 +95,156 @@ def forward(self, x):
 """
 
 
+MOCK_DUMMY_BERT_CODE_MATCH = """
+class BertDummyModel:
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_to_be_ignored(self, c):
+        return 9
+"""
+
+
+MOCK_DUMMY_ROBERTA_CODE_MATCH = """
+# Copied from transformers.models.dummy_bert_match.modeling_dummy_bert_match.BertDummyModel with BertDummy->RobertaBertDummy
+class RobertaBertDummyModel:
+
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Ignore copy
+    def only_in_roberta_to_be_ignored(self, c):
+        return 3
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def existing_common(self, c):
+        return 4
+
+    # Ignore copy
+    def existing_diff_to_be_ignored(self, c):
+        return 6
+"""
+
+
+MOCK_DUMMY_BERT_CODE_NO_MATCH = """
+class BertDummyModel:
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def only_in_bert(self, c):
+        return 7
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_not_ignored(self, c):
+        return 8
+
+    def existing_diff_to_be_ignored(self, c):
+        return 9
+"""
+
+
+MOCK_DUMMY_ROBERTA_CODE_NO_MATCH = """
+# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
+class RobertaBertDummyModel:
+
+    attr_1 = 1
+    attr_2 = 3
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Ignore copy
+    def only_in_roberta_to_be_ignored(self, c):
+        return 3
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def only_in_roberta_not_ignored(self, c):
+        return 2
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_not_ignored(self, c):
+        return 5
+
+    # Ignore copy
+    def existing_diff_to_be_ignored(self, c):
+        return 6
+"""
+
+
+EXPECTED_REPLACED_CODE = """
+# Copied from transformers.models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel with BertDummy->RobertaBertDummy
+class RobertaBertDummyModel:
+    attr_1 = 1
+    attr_2 = 2
+
+    def __init__(self, a=1, b=2):
+        self.a = a
+        self.b = b
+
+    # Copied from transformers.models.dummy_gpt2.modeling_dummy_gpt2.GPT2DummyModel.forward
+    def forward(self, c):
+        return 1
+
+    def only_in_bert(self, c):
+        return 7
+
+    def existing_common(self, c):
+        return 4
+
+    def existing_diff_not_ignored(self, c):
+        return 8
+
+    # Ignore copy
+    def existing_diff_to_be_ignored(self, c):
+        return 6
+
+    # Ignore copy
+    def only_in_roberta_to_be_ignored(self, c):
+        return 3
+"""
+
+
 def replace_in_file(filename, old, new):
     with open(filename, "r", encoding="utf-8") as f:
         content = f.read()
 
     content = content.replace(old, new)
 
-    with open(filename, "w", encoding="utf-8") as f:
+    with open(filename, "w", encoding="utf-8", newline="\n") as f:
         f.write(content)
 
 
@@ -117,11 +260,18 @@ def create_tmp_repo(tmp_dir):
     model_dir = tmp_dir / "src" / "transformers" / "models"
     model_dir.mkdir(parents=True, exist_ok=True)
 
-    models = {"bert": MOCK_BERT_CODE, "bertcopy": MOCK_BERT_COPY_CODE}
+    models = {
+        "bert": MOCK_BERT_CODE,
+        "bertcopy": MOCK_BERT_COPY_CODE,
+        "dummy_bert_match": MOCK_DUMMY_BERT_CODE_MATCH,
+        "dummy_roberta_match": MOCK_DUMMY_ROBERTA_CODE_MATCH,
+        "dummy_bert_no_match": MOCK_DUMMY_BERT_CODE_NO_MATCH,
+        "dummy_roberta_no_match": MOCK_DUMMY_ROBERTA_CODE_NO_MATCH,
+    }
     for model, code in models.items():
         model_subdir = model_dir / model
         model_subdir.mkdir(exist_ok=True)
-        with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8") as f:
+        with open(model_subdir / f"modeling_{model}.py", "w", encoding="utf-8", newline="\n") as f:
             f.write(code)
 
 
@@ -176,11 +326,47 @@ def test_is_copy_consistent(self):
                 diffs = is_copy_consistent(file_to_check)
                 self.assertEqual(diffs, [["models.bert.modeling_bert.BertModel", 22]])
 
-                diffs = is_copy_consistent(file_to_check, overwrite=True)
+                _ = is_copy_consistent(file_to_check, overwrite=True)
 
                 with open(file_to_check, "r", encoding="utf-8") as f:
                     self.assertEqual(f.read(), MOCK_BERT_COPY_CODE)
 
+    def test_is_copy_consistent_with_ignored_match(self):
+        path_to_check = ["src", "transformers", "models", "dummy_roberta_match", "modeling_dummy_roberta_match.py"]
+        with tempfile.TemporaryDirectory() as tmp_folder:
+            # Base check
+            create_tmp_repo(tmp_folder)
+            with patch_transformer_repo_path(tmp_folder):
+                file_to_check = os.path.join(tmp_folder, *path_to_check)
+                diffs = is_copy_consistent(file_to_check)
+                self.assertEqual(diffs, [])
+
+    def test_is_copy_consistent_with_ignored_no_match(self):
+        path_to_check = [
+            "src",
+            "transformers",
+            "models",
+            "dummy_roberta_no_match",
+            "modeling_dummy_roberta_no_match.py",
+        ]
+        with tempfile.TemporaryDirectory() as tmp_folder:
+            # Base check with an inconsistency
+            create_tmp_repo(tmp_folder)
+            with patch_transformer_repo_path(tmp_folder):
+                file_to_check = os.path.join(tmp_folder, *path_to_check)
+
+                diffs = is_copy_consistent(file_to_check)
+                # line 6: `attr_2 = 3` in `MOCK_DUMMY_ROBERTA_CODE_NO_MATCH`.
+                # (which has a leading `\n`.)
+                self.assertEqual(
+                    diffs, [["models.dummy_bert_no_match.modeling_dummy_bert_no_match.BertDummyModel", 6]]
+                )
+
+                _ = is_copy_consistent(file_to_check, overwrite=True)
+
+                with open(file_to_check, "r", encoding="utf-8") as f:
+                    self.assertEqual(f.read(), EXPECTED_REPLACED_CODE)
+
     def test_convert_to_localized_md(self):
         localized_readme = check_copies.LOCALIZED_READMES["README_zh-hans.md"]
 
diff --git a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py
index c38ee542e6ce8a..fd8b36fc9ab208 100644
--- a/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py
+++ b/tests/sagemaker/scripts/pytorch/run_glue_model_parallelism.py
@@ -282,7 +282,7 @@ def main():
             # Loading a dataset from local json files
             datasets = load_dataset("json", data_files=data_files)
     # See more about loading any type of standard or custom dataset at
-    # https://huggingface.co/docs/datasets/loading_datasets.html.
+    # https://huggingface.co/docs/datasets/loading_datasets.
 
     # Labels
     if data_args.task_name is not None:
@@ -299,7 +299,7 @@ def main():
             num_labels = 1
         else:
             # A useful fast method:
-            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
+            # https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.unique
             label_list = datasets["train"].unique("label")
             label_list.sort()  # Let's sort it for determinism
             num_labels = len(label_list)
diff --git a/tests/test_cache_utils.py b/tests/test_cache_utils.py
new file mode 100644
index 00000000000000..72d055c8806afd
--- /dev/null
+++ b/tests/test_cache_utils.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from transformers import set_seed
+from transformers.testing_utils import is_torch_available, require_auto_gptq, require_torch, require_torch_gpu, slow
+
+
+if is_torch_available():
+    import torch
+
+    from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache, LlamaForCausalLM, SinkCache
+
+
+@require_torch
+class CacheTest(unittest.TestCase):
+    def test_cache_equivalence(self):
+        """Tests that we can convert back and forth between the legacy cache format and DynamicCache"""
+        legacy_cache = ()
+        new_cache = DynamicCache()
+
+        # Creates a new cache with 10 layers in both formats
+        for layer_idx in range(10):
+            new_key = torch.rand((2, 4, 8, 16))
+            new_value = torch.rand((2, 4, 8, 16))
+            new_cache.update(new_key, new_value, layer_idx)
+            legacy_cache += ((new_key, new_value),)
+
+        # Sanity check 1: they must have the same shapes
+        self.assertTrue(len(legacy_cache), len(new_cache))
+        for layer_idx in range(10):
+            self.assertTrue(len(legacy_cache[layer_idx]), len(legacy_cache[layer_idx]))
+            for key_value_idx in range(2):
+                self.assertTrue(
+                    legacy_cache[layer_idx][key_value_idx].shape == new_cache[layer_idx][key_value_idx].shape
+                )
+
+        # Sanity check 2: we can get the sequence length in multiple ways with DynamicCache, and they return the
+        # expected value
+        self.assertTrue(legacy_cache[0][0].shape[-2] == new_cache[0][0].shape[-2] == new_cache.get_seq_length() == 8)
+
+        # Sanity check 3: they must be equal, and both support indexing
+        for layer_idx in range(10):
+            for key_value_idx in range(2):
+                self.assertTrue(
+                    torch.allclose(new_cache[layer_idx][key_value_idx], legacy_cache[layer_idx][key_value_idx])
+                )
+
+        # Test 1: We can convert from legacy to new with no changes
+        from_legacy = DynamicCache.from_legacy_cache(legacy_cache)
+        for layer_idx in range(10):
+            for key_value_idx in range(2):
+                self.assertTrue(
+                    torch.allclose(from_legacy[layer_idx][key_value_idx], legacy_cache[layer_idx][key_value_idx])
+                )
+
+        # Test 2: We can convert from new to legacy with no changes
+        to_legacy = new_cache.to_legacy_cache()
+        for layer_idx in range(10):
+            for key_value_idx in range(2):
+                self.assertTrue(
+                    torch.allclose(to_legacy[layer_idx][key_value_idx], new_cache[layer_idx][key_value_idx])
+                )
+
+    def test_reorder_cache_retrocompatibility(self):
+        """Tests that Cache.reorder_cache is retrocompatible with the legacy code path"""
+        legacy_reorder_fn = LlamaForCausalLM._reorder_cache  # An example of a legacy `_reorder_cache` function
+
+        legacy_cache = ()
+        new_cache = DynamicCache()
+
+        # Creates a new cache with 10 layers in both formats
+        for layer_idx in range(10):
+            new_key = torch.rand((4, 4, 8, 16))
+            new_value = torch.rand((4, 4, 8, 16))
+            new_cache.update(new_key, new_value, layer_idx)
+            legacy_cache += ((new_key, new_value),)
+
+        # Let's create some dummy beam indices. From the shape above, it is equivalent to the case where num_beams=4
+        # and batch_size=1
+        beam_idx = torch.randint(low=0, high=4, size=(4,))
+
+        legacy_cache_reordered = legacy_reorder_fn(legacy_cache, beam_idx)
+        new_cache.reorder_cache(beam_idx)
+
+        # Let's check that the results are the same
+        for layer_idx in range(10):
+            for key_value_idx in range(2):
+                self.assertTrue(
+                    torch.allclose(
+                        new_cache[layer_idx][key_value_idx], legacy_cache_reordered[layer_idx][key_value_idx]
+                    )
+                )
+
+
+@require_torch_gpu
+@slow
+class CacheIntegrationTest(unittest.TestCase):
+    def test_dynamic_cache_hard(self):
+        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
+        model = AutoModelForCausalLM.from_pretrained(
+            "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
+        )
+        inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device)
+
+        # DynamicCache and the legacy cache format should be equivalent
+        set_seed(0)
+        gen_out_legacy = model.generate(**inputs, do_sample=True, max_new_tokens=256)
+        set_seed(0)
+        gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=256, past_key_values=DynamicCache())
+        self.assertListEqual(gen_out_legacy.tolist(), gen_out.tolist())
+
+        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        expected_text = (
+            "Here's everything I know about cats. Cats are mysterious creatures. They can't talk, and they don't like "
+            "to be held. They don't play fetch, and they don't like to be hugged. But they do like to be petted.\n"
+            "Cats are also very independent. They don't like to be told what to do, and they don't like to be told "
+            "what to eat. They are also very territorial. They don't like to share their food or their toys.\nCats "
+            "are also very curious. They like to explore, and they like to play. They are also very fast. They can "
+            "run very fast, and they can jump very high.\nCats are also very smart. They can learn tricks, and they "
+            "can solve problems. They are also very playful. They like to play with toys, and they like to play with "
+            "other cats.\nCats are also very affectionate. They like to be petted, and they like to be held. They "
+            "also like to be scratched.\nCats are also very clean. They like to groom themselves, and they like to "
+            "clean their litter box.\nCats are also very independent. They don't"
+        )
+        self.assertEqual(decoded[0], expected_text)
+
+    def test_dynamic_cache_batched(self):
+        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
+        tokenizer.pad_token = tokenizer.eos_token
+        model = AutoModelForCausalLM.from_pretrained(
+            "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
+        )
+        inputs = tokenizer(["A sequence: 1, 2, 3, 4, 5", "A sequence: A, B, C"], padding=True, return_tensors="pt").to(
+            model.device
+        )
+
+        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=10, past_key_values=DynamicCache())
+        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        expected_text = ["A sequence: 1, 2, 3, 4, 5, 6, 7, 8,", "A sequence: A, B, C, D, E, F, G, H"]
+        self.assertListEqual(decoded, expected_text)
+
+    def test_dynamic_cache_beam_search(self):
+        tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", padding_side="left")
+        model = AutoModelForCausalLM.from_pretrained(
+            "meta-llama/Llama-2-7b-hf", device_map="auto", torch_dtype=torch.float16
+        )
+
+        inputs = tokenizer(["The best color is"], return_tensors="pt").to(model.device)
+        gen_out = model.generate(
+            **inputs,
+            do_sample=False,
+            max_new_tokens=20,
+            num_beams=2,
+            num_return_sequences=2,
+        )
+        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        expected_text = [
+            "The best color is the one that makes you feel good.\nThe best color is the one that makes you feel good",
+            "The best color is the one that suits you.\nThe best color is the one that suits you. The",
+        ]
+        self.assertListEqual(decoded, expected_text)
+
+    @require_auto_gptq
+    def test_sink_cache_hard(self):
+        tokenizer = AutoTokenizer.from_pretrained("TheBloke/LLaMa-7B-GPTQ")
+        model = AutoModelForCausalLM.from_pretrained("TheBloke/LLaMa-7B-GPTQ", device_map="auto")
+
+        inputs = tokenizer(["Vaswani et al. (2017) introduced the Transformers"], return_tensors="pt").to(model.device)
+
+        # Set up the SinkCache. Using a small window length to contain computational complexity. If this example is run
+        # without a SinkCache, the last few tokens are gibberish (ends in "of the of the of a of a of")
+        cache = SinkCache(window_length=508, num_sink_tokens=4)
+        gen_out = model.generate(**inputs, do_sample=False, max_new_tokens=3000, past_key_values=cache)
+        decoded = tokenizer.batch_decode(gen_out, skip_special_tokens=True)
+        self.assertTrue(decoded[0].endswith("to perform a variety of tasks. The Transformer is a neural network"))
+
+    def test_sink_cache_iterative_prompts(self):
+        """Tests that SinkCache supports more than one new token at once, when shifting the cache"""
+        tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+        model = AutoModelForCausalLM.from_pretrained(
+            "HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.float16
+        )
+        prompt = (
+            "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences "
+            "and must-see attractions."
+        )
+
+        # Prepare generation settings
+        cache = SinkCache(window_length=256, num_sink_tokens=4)
+        input_ids = torch.tensor([], device=model.device, dtype=torch.int)
+        for _ in range(3):
+            # Tokenize the prompt with the correct chat template
+            chat = [{"role": "user", "content": prompt}]
+            tokenized_chat = tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(
+                model.device
+            )
+            input_ids = torch.cat((input_ids, tokenized_chat), dim=1)
+
+            # Perform the generation
+            gen_out = model.generate(
+                input_ids, do_sample=False, max_new_tokens=100, past_key_values=cache, use_cache=True
+            )
+            input_ids = gen_out
+
+        # We went well beyond the cache length
+        self.assertTrue(input_ids.shape[1] > cache.get_max_length() * 1.5)
+
+        # And it still produces a coherent english
+        decoded = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
+        last_output = (
+            "<|assistant|>\nAs the sun began to set over the Pacific Ocean, I found myself standing on the shores of "
+            "Waikiki Beach, my heart filled with awe and wonder. I had just returned from a two-week journey to the "
+            "beautiful island of Hawaii, and it had been an unforgettable experience filled with cultural experiences "
+            "and must-see attractions that left me breathless.\n\nOne of the most memorable experiences of my trip "
+            "was visiting the historic district of Honolulu. Here,"
+        )
+        self.assertTrue(decoded[0].endswith(last_output))
diff --git a/tests/test_configuration_utils.py b/tests/test_configuration_utils.py
index a6e9e6b0390abe..2ceb1695fa96cc 100644
--- a/tests/test_configuration_utils.py
+++ b/tests/test_configuration_utils.py
@@ -198,7 +198,14 @@ def test_config_common_kwargs_is_complete(self):
         missing_keys = [key for key in base_config.__dict__ if key not in config_common_kwargs]
         # If this part of the test fails, you have arguments to addin config_common_kwargs above.
         self.assertListEqual(
-            missing_keys, ["is_encoder_decoder", "_name_or_path", "_commit_hash", "transformers_version"]
+            missing_keys,
+            [
+                "is_encoder_decoder",
+                "_name_or_path",
+                "_commit_hash",
+                "_attn_implementation_internal",
+                "transformers_version",
+            ],
         )
         keys_with_defaults = [key for key, value in config_common_kwargs.items() if value == getattr(base_config, key)]
         if len(keys_with_defaults) > 0:
diff --git a/tests/test_image_processing_common.py b/tests/test_image_processing_common.py
index cb78b33375568e..dcbee270f90b6e 100644
--- a/tests/test_image_processing_common.py
+++ b/tests/test_image_processing_common.py
@@ -15,8 +15,11 @@
 
 import json
 import os
+import pathlib
 import tempfile
 
+from transformers import BatchFeature
+from transformers.image_utils import AnnotationFormat, AnnotionFormat
 from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_vision
 from transformers.utils import is_torch_available, is_vision_available
 
@@ -285,3 +288,81 @@ def test_call_numpy_4_channels(self):
         self.assertEqual(
             tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape)
         )
+
+
+class AnnotationFormatTestMixin:
+    # this mixin adds a test to assert that usages of the
+    # to-be-deprecated `AnnotionFormat` continue to be
+    # supported for the time being
+
+    def test_processor_can_use_legacy_annotation_format(self):
+        image_processor_dict = self.image_processor_tester.prepare_image_processor_dict()
+        fixtures_path = pathlib.Path(__file__).parent / "fixtures" / "tests_samples" / "COCO"
+
+        with open(fixtures_path / "coco_annotations.txt", "r") as f:
+            detection_target = json.loads(f.read())
+
+        detection_annotations = {"image_id": 39769, "annotations": detection_target}
+
+        detection_params = {
+            "images": Image.open(fixtures_path / "000000039769.png"),
+            "annotations": detection_annotations,
+            "return_tensors": "pt",
+        }
+
+        with open(fixtures_path / "coco_panoptic_annotations.txt", "r") as f:
+            panoptic_target = json.loads(f.read())
+
+        panoptic_annotations = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": panoptic_target}
+
+        masks_path = pathlib.Path(fixtures_path / "coco_panoptic")
+
+        panoptic_params = {
+            "images": Image.open(fixtures_path / "000000039769.png"),
+            "annotations": panoptic_annotations,
+            "return_tensors": "pt",
+            "masks_path": masks_path,
+        }
+
+        test_cases = [
+            ("coco_detection", detection_params),
+            ("coco_panoptic", panoptic_params),
+            (AnnotionFormat.COCO_DETECTION, detection_params),
+            (AnnotionFormat.COCO_PANOPTIC, panoptic_params),
+            (AnnotationFormat.COCO_DETECTION, detection_params),
+            (AnnotationFormat.COCO_PANOPTIC, panoptic_params),
+        ]
+
+        def _compare(a, b) -> None:
+            if isinstance(a, (dict, BatchFeature)):
+                self.assertEqual(a.keys(), b.keys())
+                for k, v in a.items():
+                    _compare(v, b[k])
+            elif isinstance(a, list):
+                self.assertEqual(len(a), len(b))
+                for idx in range(len(a)):
+                    _compare(a[idx], b[idx])
+            elif isinstance(a, torch.Tensor):
+                self.assertTrue(torch.allclose(a, b, atol=1e-3))
+            elif isinstance(a, str):
+                self.assertEqual(a, b)
+
+        for annotation_format, params in test_cases:
+            with self.subTest(annotation_format):
+                image_processor_params = {**image_processor_dict, **{"format": annotation_format}}
+                image_processor_first = self.image_processing_class(**image_processor_params)
+
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    image_processor_first.save_pretrained(tmpdirname)
+                    image_processor_second = self.image_processing_class.from_pretrained(tmpdirname)
+
+                # check the 'format' key exists and that the dicts of the
+                # first and second processors are equal
+                self.assertIn("format", image_processor_first.to_dict().keys())
+                self.assertEqual(image_processor_second.to_dict(), image_processor_first.to_dict())
+
+                # perform encoding using both processors and compare
+                # the resulting BatchFeatures
+                first_encoding = image_processor_first(**params)
+                second_encoding = image_processor_second(**params)
+                _compare(first_encoding, second_encoding)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index 0edc23c7af2048..85e69300516164 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -12,7 +12,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import collections
 import copy
 import gc
@@ -28,15 +27,19 @@
 from typing import Dict, List, Tuple
 
 import numpy as np
+from parameterized import parameterized
 from pytest import mark
 
 import transformers
 from transformers import (
     AutoModel,
+    AutoModelForCausalLM,
     AutoModelForSequenceClassification,
     PretrainedConfig,
+    PreTrainedModel,
     is_torch_available,
     logging,
+    set_seed,
 )
 from transformers.models.auto import get_values
 from transformers.models.auto.modeling_auto import (
@@ -70,6 +73,7 @@
     require_torch,
     require_torch_gpu,
     require_torch_multi_gpu,
+    require_torch_sdpa,
     slow,
     torch_device,
 )
@@ -81,8 +85,9 @@
     is_flax_available,
     is_tf_available,
     is_torch_fx_available,
+    is_torch_sdpa_available,
 )
-from transformers.utils.generic import ModelOutput
+from transformers.utils.generic import ContextManagers, ModelOutput
 
 
 if is_accelerate_available():
@@ -96,6 +101,7 @@
     from torch import nn
 
     from transformers import MODEL_MAPPING, AdaptiveEmbedding
+    from transformers.modeling_utils import no_init_weights
     from transformers.pytorch_utils import id_tensor_storage
 
 
@@ -425,6 +431,56 @@ class CopyClass(model_class):
                         max_diff = (model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]).sum().item()
                     self.assertLessEqual(max_diff, 1e-3, msg=f"{key} not identical")
 
+    def test_fast_init_context_manager(self):
+        # 1. Create a dummy class. Should have buffers as well? To make sure we test __init__
+        class MyClass(PreTrainedModel):
+            config_class = PretrainedConfig
+
+            def __init__(self, config=None):
+                super().__init__(config if config is not None else PretrainedConfig())
+                self.linear = nn.Linear(10, 10, bias=True)
+                self.embedding = nn.Embedding(10, 10)
+                self.std = 1
+
+            def _init_weights(self, module):
+                if isinstance(module, nn.Linear):
+                    module.weight.data = nn.init.kaiming_uniform_(module.weight.data, np.sqrt(5))
+                    if module.bias is not None:
+                        module.bias.data.normal_(mean=0.0, std=self.std)
+
+        # 2. Make sure a linear layer's reset params is properly skipped:
+        with ContextManagers([no_init_weights(True)]):
+            no_init_instance = MyClass()
+
+        set_seed(0)
+        expected_bias = torch.tensor(
+            ([0.2975, 0.2131, -0.1379, -0.0796, -0.3012, -0.0057, -0.2381, -0.2439, -0.0174, 0.0475])
+        )
+        init_instance = MyClass()
+        torch.testing.assert_allclose(init_instance.linear.bias, expected_bias, rtol=1e-3, atol=1e-4)
+
+        set_seed(0)
+        torch.testing.assert_allclose(
+            init_instance.linear.weight, nn.init.kaiming_uniform_(no_init_instance.linear.weight, np.sqrt(5))
+        )
+
+        # 3. Make sure weights that are not present use init_weight_ and get expected values
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            state_dict = init_instance.state_dict()
+            del state_dict["linear.weight"]
+
+            init_instance.config.save_pretrained(tmpdirname)
+            torch.save(state_dict, os.path.join(tmpdirname, "pytorch_model.bin"))
+            set_seed(0)
+            model_fast_init = MyClass.from_pretrained(tmpdirname)
+
+            set_seed(0)
+            model_slow_init = MyClass.from_pretrained(tmpdirname, _fast_init=False)
+
+            for key in model_fast_init.state_dict().keys():
+                max_diff = torch.max(torch.abs(model_slow_init.state_dict()[key] - model_fast_init.state_dict()[key]))
+                self.assertLessEqual(max_diff.item(), 1e-3, msg=f"{key} not identical")
+
     def test_save_load_fast_init_to_base(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
         if config.__class__ not in MODEL_MAPPING:
@@ -541,8 +597,14 @@ def test_forward_signature(self):
                     else ["encoder_outputs"]
                 )
                 self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
+            elif model_class.__name__ in [*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] and self.has_attentions:
+                expected_arg_names = ["pixel_values", "output_hidden_states", "output_attentions", "return_dict"]
+                self.assertListEqual(arg_names, expected_arg_names)
+            elif model_class.__name__ in [*get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)] and not self.has_attentions:
+                expected_arg_names = ["pixel_values", "output_hidden_states", "return_dict"]
+                self.assertListEqual(arg_names, expected_arg_names)
             else:
-                expected_arg_names = ["input_ids"]
+                expected_arg_names = [model.main_input_name]
                 self.assertListEqual(arg_names[:1], expected_arg_names)
 
     def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=None):
@@ -550,10 +612,6 @@ def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=No
             return
 
         for model_class in self.all_model_classes:
-            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            config.use_cache = False
-            config.return_dict = True
-
             if (
                 model_class.__name__
                 in [*get_values(MODEL_MAPPING_NAMES), *get_values(MODEL_FOR_BACKBONE_MAPPING_NAMES)]
@@ -562,6 +620,8 @@ def check_training_gradient_checkpointing(self, gradient_checkpointing_kwargs=No
                 continue
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            config.use_cache = False
+            config.return_dict = True
             model = model_class(config)
 
             model.to(torch_device)
@@ -771,102 +831,120 @@ def _create_and_check_torchscript(self, config, inputs_dict):
         configs_no_init = _config_zero_init(config)  # To be sure we have no Nan
         configs_no_init.torchscript = True
         for model_class in self.all_model_classes:
-            model = model_class(config=configs_no_init)
-            model.to(torch_device)
-            model.eval()
-            inputs = self._prepare_for_class(inputs_dict, model_class)
-
-            main_input_name = model_class.main_input_name
-
-            try:
-                if model.config.is_encoder_decoder:
-                    model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
-                    main_input = inputs[main_input_name]
-                    attention_mask = inputs["attention_mask"]
-                    decoder_input_ids = inputs["decoder_input_ids"]
-                    decoder_attention_mask = inputs["decoder_attention_mask"]
-                    model(main_input, attention_mask, decoder_input_ids, decoder_attention_mask)
-                    traced_model = torch.jit.trace(
-                        model, (main_input, attention_mask, decoder_input_ids, decoder_attention_mask)
-                    )
-                elif "bbox" in inputs and "image" in inputs:  # LayoutLMv2 requires additional inputs
-                    input_ids = inputs["input_ids"]
-                    bbox = inputs["bbox"]
-                    image = inputs["image"].tensor
-                    model(input_ids, bbox, image)
-                    traced_model = torch.jit.trace(
-                        model, (input_ids, bbox, image), check_trace=False
-                    )  # when traced model is checked, an error is produced due to name mangling
-                elif "bbox" in inputs:  # Bros requires additional inputs (bbox)
-                    input_ids = inputs["input_ids"]
-                    bbox = inputs["bbox"]
-                    model(input_ids, bbox)
-                    traced_model = torch.jit.trace(
-                        model, (input_ids, bbox), check_trace=False
-                    )  # when traced model is checked, an error is produced due to name mangling
-                else:
-                    main_input = inputs[main_input_name]
-                    model(main_input)
-                    traced_model = torch.jit.trace(model, main_input)
-            except RuntimeError:
-                self.fail("Couldn't trace module.")
+            for attn_implementation in ["eager", "sdpa"]:
+                if attn_implementation == "sdpa" and (not model_class._supports_sdpa or not is_torch_sdpa_available()):
+                    continue
 
-            with tempfile.TemporaryDirectory() as tmp_dir_name:
-                pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+                configs_no_init._attn_implementation = attn_implementation
+                model = model_class(config=configs_no_init)
+                model.to(torch_device)
+                model.eval()
+                inputs = self._prepare_for_class(inputs_dict, model_class)
 
-                try:
-                    torch.jit.save(traced_model, pt_file_name)
-                except Exception:
-                    self.fail("Couldn't save module.")
+                main_input_name = model_class.main_input_name
 
                 try:
-                    loaded_model = torch.jit.load(pt_file_name)
-                except Exception:
-                    self.fail("Couldn't load module.")
+                    if model.config.is_encoder_decoder:
+                        model.config.use_cache = False  # FSTM still requires this hack -> FSTM should probably be refactored similar to BART afterward
+                        main_input = inputs[main_input_name]
+                        attention_mask = inputs["attention_mask"]
+                        decoder_input_ids = inputs["decoder_input_ids"]
+                        decoder_attention_mask = inputs["decoder_attention_mask"]
+                        model(main_input, attention_mask, decoder_input_ids, decoder_attention_mask)
+                        traced_model = torch.jit.trace(
+                            model, (main_input, attention_mask, decoder_input_ids, decoder_attention_mask)
+                        )
+                    elif "bbox" in inputs and "image" in inputs:  # LayoutLMv2 requires additional inputs
+                        input_ids = inputs["input_ids"]
+                        bbox = inputs["bbox"]
+                        image = inputs["image"].tensor
+                        model(input_ids, bbox, image)
+                        traced_model = torch.jit.trace(
+                            model, (input_ids, bbox, image), check_trace=False
+                        )  # when traced model is checked, an error is produced due to name mangling
+                    elif "bbox" in inputs:  # Bros requires additional inputs (bbox)
+                        input_ids = inputs["input_ids"]
+                        bbox = inputs["bbox"]
+                        model(input_ids, bbox)
+                        traced_model = torch.jit.trace(
+                            model, (input_ids, bbox), check_trace=False
+                        )  # when traced model is checked, an error is produced due to name mangling
+                    else:
+                        main_input = inputs[main_input_name]
+
+                        if model.config._attn_implementation == "sdpa":
+                            trace_input = {main_input_name: main_input}
+
+                            if "attention_mask" in inputs:
+                                trace_input["attention_mask"] = inputs["attention_mask"]
+                            else:
+                                self.skipTest("testing SDPA without attention_mask is not supported")
+
+                            model(main_input, attention_mask=inputs["attention_mask"])
+                            # example_kwarg_inputs was introduced in torch==2.0, but it is fine here since SDPA has a requirement on torch>=2.1.
+                            traced_model = torch.jit.trace(model, example_kwarg_inputs=trace_input)
+                        else:
+                            model(main_input)
+                            traced_model = torch.jit.trace(model, (main_input,))
+                except RuntimeError:
+                    self.fail("Couldn't trace module.")
+
+                with tempfile.TemporaryDirectory() as tmp_dir_name:
+                    pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt")
+
+                    try:
+                        torch.jit.save(traced_model, pt_file_name)
+                    except Exception:
+                        self.fail("Couldn't save module.")
+
+                    try:
+                        loaded_model = torch.jit.load(pt_file_name)
+                    except Exception:
+                        self.fail("Couldn't load module.")
 
-            model.to(torch_device)
-            model.eval()
+                model.to(torch_device)
+                model.eval()
 
-            loaded_model.to(torch_device)
-            loaded_model.eval()
+                loaded_model.to(torch_device)
+                loaded_model.eval()
 
-            model_state_dict = model.state_dict()
-            loaded_model_state_dict = loaded_model.state_dict()
+                model_state_dict = model.state_dict()
+                loaded_model_state_dict = loaded_model.state_dict()
 
-            non_persistent_buffers = {}
-            for key in loaded_model_state_dict.keys():
-                if key not in model_state_dict.keys():
-                    non_persistent_buffers[key] = loaded_model_state_dict[key]
+                non_persistent_buffers = {}
+                for key in loaded_model_state_dict.keys():
+                    if key not in model_state_dict.keys():
+                        non_persistent_buffers[key] = loaded_model_state_dict[key]
 
-            loaded_model_state_dict = {
-                key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
-            }
+                loaded_model_state_dict = {
+                    key: value for key, value in loaded_model_state_dict.items() if key not in non_persistent_buffers
+                }
 
-            self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
+                self.assertEqual(set(model_state_dict.keys()), set(loaded_model_state_dict.keys()))
 
-            model_buffers = list(model.buffers())
-            for non_persistent_buffer in non_persistent_buffers.values():
-                found_buffer = False
-                for i, model_buffer in enumerate(model_buffers):
-                    if torch.equal(non_persistent_buffer, model_buffer):
-                        found_buffer = True
-                        break
+                model_buffers = list(model.buffers())
+                for non_persistent_buffer in non_persistent_buffers.values():
+                    found_buffer = False
+                    for i, model_buffer in enumerate(model_buffers):
+                        if torch.equal(non_persistent_buffer, model_buffer):
+                            found_buffer = True
+                            break
 
-                self.assertTrue(found_buffer)
-                model_buffers.pop(i)
+                    self.assertTrue(found_buffer)
+                    model_buffers.pop(i)
 
-            models_equal = True
-            for layer_name, p1 in model_state_dict.items():
-                if layer_name in loaded_model_state_dict:
-                    p2 = loaded_model_state_dict[layer_name]
-                    if p1.data.ne(p2.data).sum() > 0:
-                        models_equal = False
+                models_equal = True
+                for layer_name, p1 in model_state_dict.items():
+                    if layer_name in loaded_model_state_dict:
+                        p2 = loaded_model_state_dict[layer_name]
+                        if p1.data.ne(p2.data).sum() > 0:
+                            models_equal = False
 
-            self.assertTrue(models_equal)
+                self.assertTrue(models_equal)
 
-            # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
-            # (Even with this call, there are still memory leak by ~0.04MB)
-            self.clear_torch_jit_class_registry()
+                # Avoid memory leak. Without this, each call increase RAM usage by ~20MB.
+                # (Even with this call, there are still memory leak by ~0.04MB)
+                self.clear_torch_jit_class_registry()
 
     def test_torch_fx(self):
         config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -1926,7 +2004,6 @@ def _postprocessing_to_ignore_test_cases(self, tf_outputs, pt_outputs, model_cla
             "FunnelForPreTraining",
             "ElectraForPreTraining",
             "XLMWithLMHeadModel",
-            "TransfoXLLMHeadModel",
         ]:
             for k in key_differences:
                 if k in ["loss", "losses"]:
@@ -2828,20 +2905,18 @@ def test_model_is_small(self):
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_conversion(self):
-        import torch
-
         config, _ = self.model_tester.prepare_config_and_inputs_for_common()
 
         for model_class in self.all_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             model = model_class(config)
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
                 ).to(torch_device)
 
                 for _, module in model.named_modules():
@@ -2855,11 +2930,9 @@ def test_flash_attn_2_conversion(self):
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference(self):
-        import torch
-
         for model_class in self.all_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
@@ -2867,12 +2940,12 @@ def test_flash_attn_2_inference(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model.to(torch_device)
 
@@ -2952,11 +3025,9 @@ def test_flash_attn_2_inference(self):
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_inference_padding_right(self):
-        import torch
-
         for model_class in self.all_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
@@ -2964,12 +3035,12 @@ def test_flash_attn_2_inference_padding_right(self):
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
                 model_fa = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=True
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model_fa.to(torch_device)
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.bfloat16, use_flash_attention_2=False
+                    tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
                 )
                 model.to(torch_device)
 
@@ -3045,20 +3116,18 @@ def test_flash_attn_2_inference_padding_right(self):
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_generate_left_padding(self):
-        import torch
-
         for model_class in self.all_generative_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=False, low_cpu_mem_usage=True
-                ).to(torch_device)
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
+                    torch_device
+                )
 
                 dummy_input = inputs_dict[model.main_input_name]
                 if dummy_input.dtype in [torch.float32, torch.bfloat16]:
@@ -3074,41 +3143,42 @@ def test_flash_attn_2_generate_left_padding(self):
                 )
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
                 ).to(torch_device)
 
                 out_fa = model.generate(
                     dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
                 )
 
-                self.assertTrue(torch.equal(out, out_fa))
+                self.assertTrue(torch.allclose(out, out_fa))
 
     @require_flash_attn
     @require_torch_gpu
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_generate_padding_right(self):
-        import torch
-
         for model_class in self.all_generative_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
 
             with tempfile.TemporaryDirectory() as tmpdirname:
                 model.save_pretrained(tmpdirname)
-                model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=False, low_cpu_mem_usage=True
-                ).to(torch_device)
+                model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
+                    torch_device
+                )
 
                 dummy_input = inputs_dict[model.main_input_name]
                 if dummy_input.dtype in [torch.float32, torch.bfloat16]:
                     dummy_input = dummy_input.to(torch.float16)
 
                 dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
-                # make sure we do left padding
+                # make sure we do right padding
                 dummy_attention_mask[:, :-1] = 1
                 dummy_attention_mask[:, -1:] = 0
 
@@ -3117,27 +3187,347 @@ def test_flash_attn_2_generate_padding_right(self):
                 )
 
                 model = model_class.from_pretrained(
-                    tmpdirname, torch_dtype=torch.float16, use_flash_attention_2=True, low_cpu_mem_usage=True
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    attn_implementation="flash_attention_2",
+                    low_cpu_mem_usage=True,
                 ).to(torch_device)
 
                 out_fa = model.generate(
                     dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False
                 )
 
-                self.assertTrue(torch.equal(out, out_fa))
+                self.assertTrue(torch.allclose(out, out_fa))
+
+    @parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
+    @require_torch_sdpa
+    @slow
+    def test_eager_matches_sdpa_inference(self, torch_dtype: str):
+        if not self.all_model_classes[0]._supports_sdpa:
+            self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
+
+        if torch_device == "cpu" and torch_dtype == "float16":
+            self.skipTest("float16 not supported on cpu")
+
+        # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead.
+        if torch_dtype == "float16":
+            torch_dtype = torch.float16
+        elif torch_dtype == "bfloat16":
+            torch_dtype = torch.bfloat16
+        elif torch_dtype == "float32":
+            torch_dtype = torch.float32
+
+        atols = {
+            ("cpu", False, torch.float32): 1e-6,
+            ("cpu", False, torch.bfloat16): 1e-2,
+            ("cpu", True, torch.float32): 1e-6,
+            ("cpu", True, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float32): 1e-6,
+            ("cuda", False, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float16): 1e-3,
+            ("cuda", True, torch.float32): 1e-6,
+            ("cuda", True, torch.bfloat16): 1e-2,
+            ("cuda", True, torch.float16): 5e-3,
+        }
+        rtols = {
+            ("cpu", False, torch.float32): 1e-4,
+            ("cpu", False, torch.bfloat16): 1e-2,
+            ("cpu", True, torch.float32): 1e-4,
+            ("cpu", True, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float32): 1e-4,
+            ("cuda", False, torch.bfloat16): 1e-2,
+            ("cuda", False, torch.float16): 1e-3,
+            ("cuda", True, torch.float32): 1e-4,
+            ("cuda", True, torch.bfloat16): 3e-2,
+            ("cuda", True, torch.float16): 5e-3,
+        }
+
+        def get_mean_reldiff(failcase, x, ref, atol, rtol):
+            return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}"
+
+        for model_class in self.all_model_classes:
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+            model = model_class(config)
+
+            is_encoder_decoder = model.config.is_encoder_decoder
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+                model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
+                model_sdpa = model_sdpa.eval().to(torch_device)
+
+                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch_dtype,
+                    attn_implementation="eager",
+                )
+                model_eager = model_eager.eval().to(torch_device)
+
+                self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+                for name, submodule in model_eager.named_modules():
+                    if "SdpaAttention" in submodule.__class__.__name__:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+                has_sdpa = False
+                for name, submodule in model_sdpa.named_modules():
+                    if "SdpaAttention" in submodule.__class__.__name__:
+                        has_sdpa = True
+                        break
+                if not has_sdpa and model_sdpa.config.model_type != "falcon":
+                    raise ValueError("The SDPA model should have SDPA attention layers")
+
+                # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
+                # but it would be nicer to have an efficient way to use parameterized.expand
+                fail_cases = []
+                for padding_side in ["left", "right"]:
+                    for use_mask in [False, True]:
+                        for batch_size in [1, 5]:
+                            dummy_input = inputs_dict[model.main_input_name]
+
+                            if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+                                dummy_input = dummy_input.to(torch_dtype)
+
+                            dummy_input = dummy_input[:batch_size]
+                            if dummy_input.shape[0] != batch_size:
+                                if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]:
+                                    extension = torch.rand(
+                                        batch_size - dummy_input.shape[0],
+                                        *dummy_input.shape[1:],
+                                        dtype=torch_dtype,
+                                        device=torch_device,
+                                    )
+                                    dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+                                else:
+                                    extension = torch.randint(
+                                        high=5,
+                                        size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]),
+                                        dtype=dummy_input.dtype,
+                                        device=torch_device,
+                                    )
+                                    dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device)
+
+                            if not use_mask:
+                                dummy_attention_mask = None
+                            else:
+                                dummy_attention_mask = inputs_dict.get("attention_mask", None)
+                                if dummy_attention_mask is None:
+                                    if is_encoder_decoder:
+                                        seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1]
+                                    else:
+                                        seqlen = dummy_input.shape[-1]
+                                    dummy_attention_mask = (
+                                        torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device)
+                                    )
+
+                                dummy_attention_mask = dummy_attention_mask[:batch_size]
+                                if dummy_attention_mask.shape[0] != batch_size:
+                                    extension = torch.ones(
+                                        batch_size - dummy_attention_mask.shape[0],
+                                        *dummy_attention_mask.shape[1:],
+                                        dtype=dummy_attention_mask.dtype,
+                                        device=torch_device,
+                                    )
+                                    dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0)
+                                    dummy_attention_mask = dummy_attention_mask.to(torch_device)
+
+                                dummy_attention_mask[:] = 1
+                                if padding_side == "left":
+                                    dummy_attention_mask[-1, :-1] = 1
+                                    dummy_attention_mask[-1, -4:] = 0
+                                elif padding_side == "right":
+                                    dummy_attention_mask[-1, 1:] = 1
+                                    dummy_attention_mask[-1, :3] = 0
+
+                            for enable_kernels in [False, True]:
+                                failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}"
+                                if is_encoder_decoder:
+                                    decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[:batch_size]
+                                    if decoder_input_ids.shape[0] != batch_size:
+                                        extension = torch.ones(
+                                            batch_size - decoder_input_ids.shape[0],
+                                            *decoder_input_ids.shape[1:],
+                                            dtype=decoder_input_ids.dtype,
+                                            device=torch_device,
+                                        )
+                                        decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0)
+                                        decoder_input_ids = decoder_input_ids.to(torch_device)
+
+                                    # TODO: never an `attention_mask` arg here?
+                                    other_inputs = {
+                                        "decoder_input_ids": decoder_input_ids,
+                                        "decoder_attention_mask": dummy_attention_mask,
+                                        "output_hidden_states": True,
+                                    }
+                                else:
+                                    other_inputs = {
+                                        "output_hidden_states": True,
+                                    }
+
+                                    # Otherwise fails for e.g. WhisperEncoderModel
+                                    if "attention_mask" in inspect.signature(model_eager.forward).parameters:
+                                        other_inputs["attention_mask"] = dummy_attention_mask
+
+                                # TODO: test gradients as well (& for FA2 as well!)
+                                with torch.no_grad():
+                                    with torch.backends.cuda.sdp_kernel(
+                                        enable_flash=enable_kernels,
+                                        enable_math=True,
+                                        enable_mem_efficient=enable_kernels,
+                                    ):
+                                        outputs_eager = model_eager(dummy_input, **other_inputs)
+                                        outputs_sdpa = model_sdpa(dummy_input, **other_inputs)
+
+                                logits_eager = (
+                                    outputs_eager.hidden_states[-1]
+                                    if not is_encoder_decoder
+                                    else outputs_eager.decoder_hidden_states[-1]
+                                )
+                                logits_sdpa = (
+                                    outputs_sdpa.hidden_states[-1]
+                                    if not is_encoder_decoder
+                                    else outputs_sdpa.decoder_hidden_states[-1]
+                                )
+
+                                if torch_device in ["cpu", "cuda"]:
+                                    atol = atols[torch_device, enable_kernels, torch_dtype]
+                                    rtol = rtols[torch_device, enable_kernels, torch_dtype]
+                                else:
+                                    atol = 1e-7
+                                    rtol = 1e-4
+
+                                # Masked tokens output slightly deviates - we don't mind that.
+                                if use_mask:
+                                    if padding_side == "left":
+                                        sub_sdpa = logits_sdpa[:-1]
+                                        sub_eager = logits_eager[:-1]
+                                        if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                            fail_cases.append(
+                                                get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                            )
+
+                                        sub_sdpa = logits_sdpa[-1, :-4]
+                                        sub_eager = logits_eager[-1, :-4]
+                                        if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                            fail_cases.append(
+                                                get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                            )
+
+                                        # Testing the padding tokens is not really meaningful but anyway
+                                        # sub_sdpa = logits_sdpa[-1, -4:]
+                                        # sub_eager = logits_eager[-1, -4:]
+                                        # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                        #     fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+                                    elif padding_side == "right":
+                                        sub_sdpa = logits_sdpa[:-1]
+                                        sub_eager = logits_eager[:-1]
+                                        if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                            fail_cases.append(
+                                                get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                            )
+
+                                        sub_sdpa = logits_sdpa[-1, 3:]
+                                        sub_eager = logits_eager[-1, 3:]
+                                        if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                            fail_cases.append(
+                                                get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol)
+                                            )
+
+                                        # Testing the padding tokens is not really meaningful but anyway
+                                        # sub_sdpa = logits_sdpa[-1, :3]
+                                        # sub_eager = logits_eager[-1, :3]
+                                        # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol):
+                                        #     fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2))
+
+                                else:
+                                    if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol):
+                                        fail_cases.append(
+                                            get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol)
+                                        )
+
+                self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
+
+    @require_torch_sdpa
+    @slow
+    def test_eager_matches_sdpa_generate(self):
+        max_new_tokens = 30
+
+        if len(self.all_generative_model_classes) == 0:
+            self.skipTest(f"{self.__class__.__name__} tests a model that does support generate: skipping this test")
+
+        for model_class in self.all_generative_model_classes:
+            if not model_class._supports_sdpa:
+                self.skipTest(f"{model_class.__name__} does not support SDPA")
+
+            config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+            dummy_input = inputs_dict[model_class.main_input_name]
+            if dummy_input.dtype in [torch.float32, torch.bfloat16]:
+                dummy_input = dummy_input.to(torch.float16)
+
+            # make sure that all models have enough positions for generation
+            if hasattr(config, "max_position_embeddings"):
+                config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
+
+            model = model_class(config)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                model.save_pretrained(tmpdirname)
+
+                dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
+
+                model_sdpa = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    low_cpu_mem_usage=True,
+                ).to(torch_device)
+
+                self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
+
+                model_eager = model_class.from_pretrained(
+                    tmpdirname,
+                    torch_dtype=torch.float16,
+                    low_cpu_mem_usage=True,
+                    attn_implementation="eager",
+                ).to(torch_device)
+
+                self.assertTrue(model_eager.config._attn_implementation == "eager")
+
+                for name, submodule in model_eager.named_modules():
+                    if "SdpaAttention" in submodule.__class__.__name__:
+                        raise ValueError("The eager model should not have SDPA attention layers")
+
+                has_sdpa = False
+                for name, submodule in model_sdpa.named_modules():
+                    if "SdpaAttention" in submodule.__class__.__name__:
+                        has_sdpa = True
+                        break
+                if not has_sdpa:
+                    raise ValueError("The SDPA model should have SDPA attention layers")
+
+                # Just test that a large cache works as expected
+                res_eager = model_eager.generate(
+                    dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
+                )
+
+                res_sdpa = model_sdpa.generate(
+                    dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
+                )
+
+                self.assertTrue(torch.allclose(res_eager, res_sdpa))
 
     @require_flash_attn
     @require_torch_gpu
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_generate_use_cache(self):
-        import torch
-
         max_new_tokens = 30
 
         for model_class in self.all_generative_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -3159,13 +3549,17 @@ def test_flash_attn_2_generate_use_cache(self):
                 model = model_class.from_pretrained(
                     tmpdirname,
                     torch_dtype=torch.float16,
-                    use_flash_attention_2=True,
+                    attn_implementation="flash_attention_2",
                     low_cpu_mem_usage=True,
                 ).to(torch_device)
 
                 # Just test that a large cache works as expected
                 _ = model.generate(
-                    dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
+                    dummy_input,
+                    attention_mask=dummy_attention_mask,
+                    max_new_tokens=max_new_tokens,
+                    do_sample=False,
+                    use_cache=True,
                 )
 
     @require_flash_attn
@@ -3174,11 +3568,9 @@ def test_flash_attn_2_generate_use_cache(self):
     @mark.flash_attn_test
     @slow
     def test_flash_attn_2_fp32_ln(self):
-        import torch
-
         for model_class in self.all_generative_model_classes:
             if not model_class._supports_flash_attn_2:
-                return
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
 
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
             model = model_class(config)
@@ -3196,7 +3588,7 @@ def test_flash_attn_2_fp32_ln(self):
                 model = model_class.from_pretrained(
                     tmpdirname,
                     torch_dtype=torch.float16,
-                    use_flash_attention_2=True,
+                    attn_implementation="flash_attention_2",
                     low_cpu_mem_usage=True,
                     load_in_4bit=True,
                 )
@@ -3269,6 +3661,51 @@ def test_flax_from_pt_safetensors(self):
                 # Check models are equal
                 self.assertTrue(check_models_equal(flax_model_1, flax_model_2))
 
+    @require_flash_attn
+    @require_torch_gpu
+    @mark.flash_attn_test
+    @slow
+    def test_flash_attn_2_from_config(self):
+        for model_class in self.all_generative_model_classes:
+            if not model_class._supports_flash_attn_2:
+                self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
+
+            config, _ = self.model_tester.prepare_config_and_inputs_for_common()
+            # TODO: to change it in the future with other relevant auto classes
+            fa2_model = AutoModelForCausalLM.from_config(
+                config, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16
+            ).to(torch_device)
+
+            dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device)
+            dummy_attention_mask = torch.LongTensor([[1, 1, 1, 1], [0, 1, 1, 1]]).to(torch_device)
+
+            fa2_correctly_converted = False
+
+            for _, module in fa2_model.named_modules():
+                if "FlashAttention" in module.__class__.__name__:
+                    fa2_correctly_converted = True
+                    break
+
+            self.assertTrue(fa2_correctly_converted)
+
+            _ = fa2_model(input_ids=dummy_input, attention_mask=dummy_attention_mask)
+
+            with tempfile.TemporaryDirectory() as tmpdirname:
+                fa2_model.save_pretrained(tmpdirname)
+
+                model_from_pretrained = AutoModelForCausalLM.from_pretrained(tmpdirname)
+
+                self.assertTrue(model_from_pretrained.config._attn_implementation != "flash_attention_2")
+
+                fa2_correctly_converted = False
+
+                for _, module in model_from_pretrained.named_modules():
+                    if "FlashAttention" in module.__class__.__name__:
+                        fa2_correctly_converted = True
+                        break
+
+                self.assertFalse(fa2_correctly_converted)
+
 
 global_rng = random.Random()
 
diff --git a/tests/test_modeling_tf_common.py b/tests/test_modeling_tf_common.py
index d7cd62b41a025e..7ac744263cc023 100644
--- a/tests/test_modeling_tf_common.py
+++ b/tests/test_modeling_tf_common.py
@@ -465,7 +465,6 @@ def _postprocessing_to_ignore_test_cases(self, tf_outputs, pt_outputs, model_cla
             "TFFunnelForPreTraining",
             "TFElectraForPreTraining",
             "TFXLMWithLMHeadModel",
-            "TFTransfoXLLMHeadModel",
         ]:
             for k in key_differences:
                 if k in ["loss", "losses"]:
diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py
index 62a639a95f5695..ddfaad5214dc50 100755
--- a/tests/test_modeling_utils.py
+++ b/tests/test_modeling_utils.py
@@ -21,9 +21,11 @@
 import tempfile
 import unittest
 import unittest.mock as mock
+import uuid
 from pathlib import Path
 
-from huggingface_hub import HfFolder, delete_repo
+import requests
+from huggingface_hub import HfApi, HfFolder, delete_repo
 from huggingface_hub.file_download import http_get
 from pytest import mark
 from requests.exceptions import HTTPError
@@ -58,7 +60,13 @@
     WEIGHTS_INDEX_NAME,
     WEIGHTS_NAME,
 )
-from transformers.utils.import_utils import is_flax_available, is_tf_available, is_torchdynamo_available
+from transformers.utils.import_utils import (
+    is_flash_attn_2_available,
+    is_flax_available,
+    is_tf_available,
+    is_torch_sdpa_available,
+    is_torchdynamo_available,
+)
 
 
 sys.path.append(str(Path(__file__).parent.parent / "utils"))
@@ -83,7 +91,12 @@
         T5Config,
         T5ForConditionalGeneration,
     )
-    from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+    from transformers.modeling_attn_mask_utils import (
+        AttentionMaskConverter,
+        _create_4d_causal_attention_mask,
+        _prepare_4d_attention_mask,
+        _prepare_4d_causal_attention_mask,
+    )
     from transformers.modeling_utils import shard_checkpoint
 
     # Fake pretrained models for tests
@@ -148,6 +161,32 @@ def forward(self, x):
         def tie_weights(self):
             self.decoder.weight = self.base.linear.weight
 
+    class Prepare4dCausalAttentionMaskModel(nn.Module):
+        def forward(self, inputs_embeds):
+            batch_size, seq_length, _ = inputs_embeds.shape
+            past_key_values_length = 4
+            attention_mask = _prepare_4d_causal_attention_mask(
+                None, (batch_size, seq_length), inputs_embeds, past_key_values_length
+            )
+            return attention_mask
+
+    class Create4dCausalAttentionMaskModel(nn.Module):
+        def forward(self, inputs_embeds):
+            batch_size, seq_length, _ = inputs_embeds.shape
+            past_key_values_length = 4
+            attention_mask = _create_4d_causal_attention_mask(
+                (batch_size, seq_length),
+                dtype=inputs_embeds.dtype,
+                device=inputs_embeds.device,
+                past_key_values_length=past_key_values_length,
+            )
+            return attention_mask
+
+    class Prepare4dAttentionMaskModel(nn.Module):
+        def forward(self, mask, inputs_embeds):
+            attention_mask = _prepare_4d_attention_mask(mask, dtype=inputs_embeds.dtype)
+            return attention_mask
+
 
 if is_flax_available():
     from transformers import FlaxBertModel
@@ -829,14 +868,8 @@ def test_legacy_load_from_url(self):
 
     @require_safetensors
     def test_use_safetensors(self):
-        # test nice error message if no safetensor files available
-        with self.assertRaises(OSError) as env_error:
-            AutoModel.from_pretrained("hf-internal-testing/tiny-random-RobertaModel", use_safetensors=True)
-
-        self.assertTrue(
-            "model.safetensors or model.safetensors.index.json and thus cannot be loaded with `safetensors`"
-            in str(env_error.exception)
-        )
+        # Should not raise anymore
+        AutoModel.from_pretrained("hf-internal-testing/tiny-random-RobertaModel", use_safetensors=True)
 
         # test that error if only safetensors is available
         with self.assertRaises(OSError) as env_error:
@@ -1171,6 +1204,203 @@ def test_safetensors_torch_from_torch_sharded(self):
             self.assertTrue(torch.equal(p1, p2))
 
 
+@slow
+@require_torch
+class ModelOnTheFlyConversionTester(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        cls.user = "huggingface-hub-ci"
+        cls.token = os.getenv("HUGGINGFACE_PRODUCTION_USER_TOKEN", None)
+
+        if cls.token is None:
+            raise ValueError("Cannot run tests as secret isn't setup.")
+
+        cls.api = HfApi(token=cls.token)
+
+    def setUp(self) -> None:
+        self.repo_name = f"{self.user}/test-model-on-the-fly-{uuid.uuid4()}"
+
+    def tearDown(self) -> None:
+        self.api.delete_repo(self.repo_name)
+
+    def test_safetensors_on_the_fly_conversion(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False)
+        converted_model = BertModel.from_pretrained(self.repo_name, use_safetensors=True)
+
+        with self.subTest("Initial and converted models are equal"):
+            for p1, p2 in zip(initial_model.parameters(), converted_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, "SFconvertbot")
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+    def test_safetensors_on_the_fly_conversion_private(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False, private=True)
+        converted_model = BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token)
+
+        with self.subTest("Initial and converted models are equal"):
+            for p1, p2 in zip(initial_model.parameters(), converted_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name, token=self.token)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, self.user)
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+    def test_safetensors_on_the_fly_conversion_gated(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False)
+        headers = {"Authorization": f"Bearer {self.token}"}
+        requests.put(
+            f"https://huggingface.co/api/models/{self.repo_name}/settings", json={"gated": "auto"}, headers=headers
+        )
+        converted_model = BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token)
+
+        with self.subTest("Initial and converted models are equal"):
+            for p1, p2 in zip(initial_model.parameters(), converted_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, "SFconvertbot")
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+    def test_safetensors_on_the_fly_sharded_conversion(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False, max_shard_size="200kb")
+        converted_model = BertModel.from_pretrained(self.repo_name, use_safetensors=True)
+
+        with self.subTest("Initial and converted models are equal"):
+            for p1, p2 in zip(initial_model.parameters(), converted_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, "SFconvertbot")
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+    def test_safetensors_on_the_fly_sharded_conversion_private(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(
+            self.repo_name, token=self.token, safe_serialization=False, max_shard_size="200kb", private=True
+        )
+        converted_model = BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token)
+
+        with self.subTest("Initial and converted models are equal"):
+            for p1, p2 in zip(initial_model.parameters(), converted_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, self.user)
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+    def test_safetensors_on_the_fly_sharded_conversion_gated(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(self.repo_name, token=self.token, max_shard_size="200kb", safe_serialization=False)
+        headers = {"Authorization": f"Bearer {self.token}"}
+        requests.put(
+            f"https://huggingface.co/api/models/{self.repo_name}/settings", json={"gated": "auto"}, headers=headers
+        )
+        converted_model = BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token)
+
+        with self.subTest("Initial and converted models are equal"):
+            for p1, p2 in zip(initial_model.parameters(), converted_model.parameters()):
+                self.assertTrue(torch.equal(p1, p2))
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, "SFconvertbot")
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+    @unittest.skip("Edge case, should work once the Space is updated`")
+    def test_safetensors_on_the_fly_wrong_user_opened_pr(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False, private=True)
+        BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token)
+
+        # This should have opened a PR with the user's account
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+            discussion = next(discussions)
+            self.assertEqual(discussion.author, self.user)
+            self.assertEqual(discussion.title, "Adding `safetensors` variant of this model")
+
+        # We now switch the repo visibility to public
+        self.api.update_repo_visibility(self.repo_name, private=False)
+
+        # We once again call from_pretrained, which should call the bot to open a PR
+        BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token)
+
+        with self.subTest("PR was open with the safetensors account"):
+            discussions = self.api.get_repo_discussions(self.repo_name)
+
+            bot_opened_pr = None
+            bot_opened_pr_title = None
+
+            for discussion in discussions:
+                if discussion.author == "SFconvertBot":
+                    bot_opened_pr = True
+                    bot_opened_pr_title = discussion.title
+
+            self.assertTrue(bot_opened_pr)
+            self.assertEqual(bot_opened_pr_title, "Adding `safetensors` variant of this model")
+
+    def test_safetensors_on_the_fly_specific_revision(self):
+        config = BertConfig(
+            vocab_size=99, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, intermediate_size=37
+        )
+        initial_model = BertModel(config)
+
+        # Push a model on `main`
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False)
+
+        # Push a model on a given revision
+        initial_model.push_to_hub(self.repo_name, token=self.token, safe_serialization=False, revision="new-branch")
+
+        # Try to convert the model on that revision should raise
+        with self.assertRaises(EnvironmentError):
+            BertModel.from_pretrained(self.repo_name, use_safetensors=True, token=self.token, revision="new-branch")
+
+
 @require_torch
 @is_staging_test
 class ModelPushToHubTester(unittest.TestCase):
@@ -1300,7 +1530,7 @@ def check_to_4d(self, mask_converter, q_len, kv_len, additional_mask=None, bsz=3
             for bsz_idx, seq_idx in additional_mask:
                 mask_2d[bsz_idx, seq_idx] = 0
 
-        mask_4d = mask_converter.to_4d(mask_2d, query_length=q_len, key_value_length=kv_len)
+        mask_4d = mask_converter.to_4d(mask_2d, query_length=q_len, key_value_length=kv_len, dtype=torch.float32)
 
         assert mask_4d.shape == (bsz, 1, q_len, kv_len)
 
@@ -1336,7 +1566,9 @@ def check_to_4d(self, mask_converter, q_len, kv_len, additional_mask=None, bsz=3
                 self.check_non_causal(bsz, q_len, kv_len, mask_2d, mask_4d)
 
     def check_to_causal(self, mask_converter, q_len, kv_len, bsz=3):
-        mask_4d = mask_converter.to_causal_4d(bsz, query_length=q_len, key_value_length=kv_len, device=torch_device)
+        mask_4d = mask_converter.to_causal_4d(
+            bsz, query_length=q_len, key_value_length=kv_len, device=torch_device, dtype=torch.float32
+        )
 
         if q_len == 1 and mask_converter.sliding_window is None:
             # no causal mask if q_len is 1
@@ -1428,3 +1660,193 @@ def test_causal_mask_sliding(self):
         self.check_to_causal(mask_converter, q_len=3, kv_len=7)
         # non auto-regressive case
         self.check_to_causal(mask_converter, q_len=7, kv_len=7)
+
+    def test_torch_compile_fullgraph(self):
+        model = Prepare4dCausalAttentionMaskModel()
+
+        inputs_embeds = torch.rand([1, 3, 32])
+        res_non_compiled = model(inputs_embeds)
+
+        compiled_model = torch.compile(model, fullgraph=True)
+
+        res_compiled = compiled_model(inputs_embeds)
+
+        self.assertTrue(torch.equal(res_non_compiled, res_compiled))
+
+        model = Create4dCausalAttentionMaskModel()
+
+        inputs_embeds = torch.rand(2, 4, 16)
+        res_non_compiled = model(inputs_embeds)
+
+        compiled_model = torch.compile(model, fullgraph=True)
+        res_compiled = compiled_model(inputs_embeds)
+
+        self.assertTrue(torch.equal(res_non_compiled, res_compiled))
+
+        model = Prepare4dAttentionMaskModel()
+
+        mask = torch.ones(2, 4)
+        mask[0, :2] = 0
+        inputs_embeds = torch.rand(2, 4, 16)
+
+        res_non_compiled = model(mask, inputs_embeds)
+
+        compiled_model = torch.compile(model, fullgraph=True)
+        res_compiled = compiled_model(mask, inputs_embeds)
+
+        self.assertTrue(torch.equal(res_non_compiled, res_compiled))
+
+    @require_torch
+    @slow
+    def test_unmask_unattended_left_padding(self):
+        attention_mask = torch.Tensor([[0, 0, 1], [1, 1, 1], [0, 1, 1]]).to(torch.int64)
+
+        expanded_mask = torch.Tensor(
+            [
+                [[[0, 0, 0], [0, 0, 0], [0, 0, 1]]],
+                [[[1, 0, 0], [1, 1, 0], [1, 1, 1]]],
+                [[[0, 0, 0], [0, 1, 0], [0, 1, 1]]],
+            ]
+        ).to(torch.int64)
+
+        reference_output = torch.Tensor(
+            [
+                [[[1, 1, 1], [1, 1, 1], [0, 0, 1]]],
+                [[[1, 0, 0], [1, 1, 0], [1, 1, 1]]],
+                [[[1, 1, 1], [0, 1, 0], [0, 1, 1]]],
+            ]
+        ).to(torch.int64)
+
+        result = AttentionMaskConverter._unmask_unattended(expanded_mask, attention_mask, unmasked_value=1)
+
+        self.assertTrue(torch.equal(result, reference_output))
+
+        attention_mask = torch.Tensor([[0, 0, 1, 1, 1], [1, 1, 1, 1, 1], [0, 1, 1, 1, 1]]).to(torch.int64)
+
+        attn_mask_converter = AttentionMaskConverter(is_causal=True)
+        past_key_values_length = 0
+        key_value_length = attention_mask.shape[-1] + past_key_values_length
+
+        expanded_mask = attn_mask_converter.to_4d(
+            attention_mask, attention_mask.shape[-1], key_value_length=key_value_length, dtype=torch.float32
+        )
+
+        result = AttentionMaskConverter._unmask_unattended(expanded_mask, attention_mask, unmasked_value=0)
+        min_inf = torch.finfo(torch.float32).min
+        reference_output = torch.Tensor(
+            [
+                [
+                    [
+                        [0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0],
+                        [min_inf, min_inf, 0, min_inf, min_inf],
+                        [min_inf, min_inf, 0, 0, min_inf],
+                        [min_inf, min_inf, 0, 0, 0],
+                    ]
+                ],
+                [
+                    [
+                        [0, min_inf, min_inf, min_inf, min_inf],
+                        [0, 0, min_inf, min_inf, min_inf],
+                        [0, 0, 0, min_inf, min_inf],
+                        [0, 0, 0, 0, min_inf],
+                        [0, 0, 0, 0, 0],
+                    ]
+                ],
+                [
+                    [
+                        [0, 0, 0, 0, 0],
+                        [min_inf, 0, min_inf, min_inf, min_inf],
+                        [min_inf, 0, 0, min_inf, min_inf],
+                        [min_inf, 0, 0, 0, min_inf],
+                        [min_inf, 0, 0, 0, 0],
+                    ]
+                ],
+            ]
+        )
+
+        self.assertTrue(torch.equal(reference_output, result))
+
+    @require_torch
+    @slow
+    def test_unmask_unattended_right_padding(self):
+        attention_mask = torch.Tensor([[1, 1, 1, 0], [1, 1, 1, 1], [1, 1, 0, 0]]).to(torch.int64)
+
+        attn_mask_converter = AttentionMaskConverter(is_causal=True)
+        past_key_values_length = 0
+        key_value_length = attention_mask.shape[-1] + past_key_values_length
+
+        expanded_mask = attn_mask_converter.to_4d(
+            attention_mask, attention_mask.shape[-1], key_value_length=key_value_length, dtype=torch.float32
+        )
+
+        result = AttentionMaskConverter._unmask_unattended(expanded_mask, attention_mask, unmasked_value=0)
+
+        self.assertTrue(torch.equal(expanded_mask, result))
+
+    @require_torch
+    @slow
+    def test_unmask_unattended_random_mask(self):
+        attention_mask = torch.Tensor([[1, 0, 1, 0], [1, 0, 1, 1], [1, 1, 0, 1]]).to(torch.int64)
+
+        attn_mask_converter = AttentionMaskConverter(is_causal=True)
+        past_key_values_length = 0
+        key_value_length = attention_mask.shape[-1] + past_key_values_length
+
+        expanded_mask = attn_mask_converter.to_4d(
+            attention_mask, attention_mask.shape[-1], key_value_length=key_value_length, dtype=torch.float32
+        )
+
+        result = AttentionMaskConverter._unmask_unattended(expanded_mask, attention_mask, unmasked_value=0)
+
+        self.assertTrue(torch.equal(expanded_mask, result))
+
+
+@require_torch
+class TestAttentionImplementation(unittest.TestCase):
+    def test_error_no_sdpa_available(self):
+        with self.assertRaises(ValueError) as cm:
+            _ = AutoModel.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel", attn_implementation="sdpa")
+
+        self.assertTrue(
+            "does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention"
+            in str(cm.exception)
+        )
+
+        _ = AutoModel.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel")
+
+    def test_error_no_flash_available(self):
+        with self.assertRaises(ValueError) as cm:
+            _ = AutoModel.from_pretrained(
+                "hf-tiny-model-private/tiny-random-MCTCTModel", attn_implementation="flash_attention_2"
+            )
+
+        self.assertTrue("does not support Flash Attention 2.0" in str(cm.exception))
+
+    def test_error_wrong_attn_implementation(self):
+        with self.assertRaises(ValueError) as cm:
+            _ = AutoModel.from_pretrained("hf-tiny-model-private/tiny-random-MCTCTModel", attn_implementation="foo")
+
+        self.assertTrue('The only possible arguments are `attn_implementation="eager"' in str(cm.exception))
+
+    def test_not_available_flash(self):
+        if is_flash_attn_2_available():
+            self.skipTest("Please uninstall flash-attn package to run test_not_available_flash")
+
+        with self.assertRaises(ImportError) as cm:
+            _ = AutoModel.from_pretrained(
+                "hf-internal-testing/tiny-random-GPTBigCodeModel", attn_implementation="flash_attention_2"
+            )
+
+        self.assertTrue("the package flash_attn seems to be not installed" in str(cm.exception))
+
+    def test_not_available_sdpa(self):
+        if is_torch_sdpa_available():
+            self.skipTest("This test requires torch<=2.0")
+
+        with self.assertRaises(ImportError) as cm:
+            _ = AutoModel.from_pretrained(
+                "hf-internal-testing/tiny-random-GPTBigCodeModel", attn_implementation="sdpa"
+            )
+
+        self.assertTrue("PyTorch SDPA requirements in Transformers are not met" in str(cm.exception))
diff --git a/tests/trainer/test_trainer.py b/tests/trainer/test_trainer.py
index b9f801fabd7f1e..05f84bc00fa2af 100644
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -38,7 +38,9 @@
     AutoTokenizer,
     IntervalStrategy,
     PretrainedConfig,
+    TrainerCallback,
     TrainingArguments,
+    get_polynomial_decay_schedule_with_warmup,
     is_torch_available,
     logging,
 )
@@ -78,7 +80,8 @@
     slow,
     torch_device,
 )
-from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend
+from transformers.tokenization_utils_base import PreTrainedTokenizerBase
+from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, HPSearchBackend, get_last_checkpoint
 from transformers.training_args import OptimizerNames
 from transformers.utils import (
     SAFE_WEIGHTS_INDEX_NAME,
@@ -643,6 +646,33 @@ def test_custom_optimizer(self):
         self.assertFalse(torch.allclose(trainer.model.b, b))
         self.assertEqual(trainer.optimizer.state_dict()["param_groups"][0]["lr"], 1.0)
 
+    def test_lr_scheduler_kwargs(self):
+        # test scheduler kwargs passed via TrainingArguments
+        train_dataset = RegressionDataset()
+        model = RegressionModel()
+        num_steps, num_warmup_steps = 10, 2
+        extra_kwargs = {"power": 5.0, "lr_end": 1e-5}  # Non-default arguments
+        args = TrainingArguments(
+            "./regression",
+            lr_scheduler_type="polynomial",
+            lr_scheduler_kwargs=extra_kwargs,
+            learning_rate=0.2,
+            warmup_steps=num_warmup_steps,
+        )
+        trainer = Trainer(model, args, train_dataset=train_dataset)
+        trainer.create_optimizer_and_scheduler(num_training_steps=num_steps)
+
+        # Checking that the scheduler was created
+        self.assertIsNotNone(trainer.lr_scheduler)
+
+        # Checking that the correct args were passed
+        sched1 = trainer.lr_scheduler
+        sched2 = get_polynomial_decay_schedule_with_warmup(
+            trainer.optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_steps, **extra_kwargs
+        )
+        self.assertEqual(sched1.lr_lambdas[0].args, sched2.lr_lambdas[0].args)
+        self.assertEqual(sched1.lr_lambdas[0].keywords, sched2.lr_lambdas[0].keywords)
+
     def test_reduce_lr_on_plateau_args(self):
         # test passed arguments for a custom ReduceLROnPlateau scheduler
         train_dataset = RegressionDataset(length=64)
@@ -672,7 +702,7 @@ class TrainerWithLRLogs(Trainer):
             def log(self, logs):
                 # the LR is computed after metrics and does not exist for the first epoch
                 if hasattr(self.lr_scheduler, "_last_lr"):
-                    logs["learning_rate"] = self.lr_scheduler._last_lr
+                    logs["learning_rate"] = self.lr_scheduler._last_lr[0]
                 super().log(logs)
 
         train_dataset = RegressionDataset(length=64)
@@ -702,14 +732,14 @@ def log(self, logs):
             if loss > best_loss:
                 bad_epochs += 1
                 if bad_epochs > patience:
-                    self.assertLess(logs[i + 1]["learning_rate"][0], log["learning_rate"][0])
+                    self.assertLess(logs[i + 1]["learning_rate"], log["learning_rate"])
                     just_decreased = True
                     bad_epochs = 0
             else:
                 best_loss = loss
                 bad_epochs = 0
             if not just_decreased:
-                self.assertEqual(logs[i + 1]["learning_rate"][0], log["learning_rate"][0])
+                self.assertEqual(logs[i + 1]["learning_rate"], log["learning_rate"])
 
     def test_adafactor_lr_none(self):
         # test the special case where lr=None, since Trainer can't not have lr_scheduler
@@ -1282,6 +1312,19 @@ def test_save_checkpoints(self):
             trainer.train()
             self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
 
+    def test_save_checkpoints_is_atomic(self):
+        class UnsaveableTokenizer(PreTrainedTokenizerBase):
+            def save_pretrained(self, *args, **kwargs):
+                raise OSError("simulated file write error")
+
+        with tempfile.TemporaryDirectory() as tmpdir:
+            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
+            # Attach unsaveable tokenizer to partially fail checkpointing
+            trainer.tokenizer = UnsaveableTokenizer()
+            with self.assertRaises(OSError) as _context:
+                trainer.train()
+            assert get_last_checkpoint(tmpdir) is None
+
     @require_safetensors
     def test_safe_checkpoints(self):
         for save_safetensors in [True, False]:
@@ -1504,6 +1547,41 @@ def test_auto_batch_size_finder(self):
         with patch.object(sys, "argv", testargs):
             run_glue.main()
 
+    def test_auto_batch_size_with_resume_from_checkpoint(self):
+        train_dataset = RegressionDataset(length=128)
+
+        config = RegressionModelConfig(a=0, b=2)
+        model = RegressionRandomPreTrainedModel(config)
+
+        tmp_dir = self.get_auto_remove_tmp_dir()
+
+        class MockCudaOOMCallback(TrainerCallback):
+            def on_step_end(self, args, state, control, **kwargs):
+                # simulate OOM on the first step
+                if state.train_batch_size >= 16:
+                    raise RuntimeError("CUDA out of memory.")
+
+        args = RegressionTrainingArguments(
+            tmp_dir,
+            do_train=True,
+            max_steps=2,
+            save_steps=1,
+            per_device_train_batch_size=16,
+            auto_find_batch_size=True,
+        )
+        trainer = Trainer(model, args, train_dataset=train_dataset, callbacks=[MockCudaOOMCallback()])
+        trainer.train()
+        # After `auto_find_batch_size` is ran we should now be at 8
+        self.assertEqual(trainer._train_batch_size, 8)
+
+        # We can then make a new Trainer
+        trainer = Trainer(model, args, train_dataset=train_dataset)
+        # Check we are at 16 to start
+        self.assertEqual(trainer._train_batch_size, 16 * max(trainer.args.n_gpu, 1))
+        trainer.train(resume_from_checkpoint=True)
+        # We should be back to 8 again, picking up based upon the last ran Trainer
+        self.assertEqual(trainer._train_batch_size, 8)
+
     # regression for this issue: https://github.com/huggingface/transformers/issues/12970
     def test_training_with_resume_from_checkpoint_false(self):
         train_dataset = RegressionDataset(length=128)
diff --git a/tests/utils/test_doc_samples.py b/tests/utils/test_doc_samples.py
index 84c5a4d2bf5008..953654537843ee 100644
--- a/tests/utils/test_doc_samples.py
+++ b/tests/utils/test_doc_samples.py
@@ -12,11 +12,11 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import doctest
 import logging
 import os
 import unittest
+from glob import glob
 from pathlib import Path
 from typing import List, Union
 
@@ -27,6 +27,63 @@
 logger = logging.getLogger()
 
 
+@require_torch
+class TestDocLists(unittest.TestCase):
+    def test_flash_support_list(self):
+        with open("./docs/source/en/perf_infer_gpu_one.md", "r") as f:
+            doctext = f.read()
+
+            doctext = doctext.split("FlashAttention-2 is currently supported for the following architectures:")[1]
+            doctext = doctext.split("You can request to add FlashAttention-2 support")[0]
+
+        patterns = glob("./src/transformers/models/**/modeling_*.py")
+        patterns_tf = glob("./src/transformers/models/**/modeling_tf_*.py")
+        patterns_flax = glob("./src/transformers/models/**/modeling_flax_*.py")
+        patterns = list(set(patterns) - set(patterns_tf) - set(patterns_flax))
+        archs_supporting_fa2 = []
+        for filename in patterns:
+            with open(filename, "r") as f:
+                text = f.read()
+
+                if "_supports_flash_attn_2 = True" in text:
+                    model_name = os.path.basename(filename).replace(".py", "").replace("modeling_", "")
+                    archs_supporting_fa2.append(model_name)
+
+        for arch in archs_supporting_fa2:
+            if arch not in doctext:
+                raise ValueError(
+                    f"{arch} should be in listed in the flash attention documentation but is not. Please update the documentation."
+                )
+
+    def test_sdpa_support_list(self):
+        with open("./docs/source/en/perf_infer_gpu_one.md", "r") as f:
+            doctext = f.read()
+
+            doctext = doctext.split(
+                "For now, Transformers supports inference and training through SDPA for the following architectures:"
+            )[1]
+            doctext = doctext.split("Note that FlashAttention can only be used for models using the")[0]
+
+        patterns = glob("./src/transformers/models/**/modeling_*.py")
+        patterns_tf = glob("./src/transformers/models/**/modeling_tf_*.py")
+        patterns_flax = glob("./src/transformers/models/**/modeling_flax_*.py")
+        patterns = list(set(patterns) - set(patterns_tf) - set(patterns_flax))
+        archs_supporting_sdpa = []
+        for filename in patterns:
+            with open(filename, "r") as f:
+                text = f.read()
+
+                if "_supports_sdpa = True" in text:
+                    model_name = os.path.basename(filename).replace(".py", "").replace("modeling_", "")
+                    archs_supporting_sdpa.append(model_name)
+
+        for arch in archs_supporting_sdpa:
+            if arch not in doctext:
+                raise ValueError(
+                    f"{arch} should be in listed in the SDPA documentation but is not. Please update the documentation."
+                )
+
+
 @unittest.skip("Temporarily disable the doc tests.")
 @require_torch
 @require_tf
diff --git a/tests/utils/tiny_model_summary.json b/tests/utils/tiny_model_summary.json
index 2a1efa7d88d283..5f2c6c0b4e7438 100644
--- a/tests/utils/tiny_model_summary.json
+++ b/tests/utils/tiny_model_summary.json
@@ -877,6 +877,16 @@
         ],
         "sha": "a7874595b900f9b2ddc79130dafc3ff48f4fbfb9"
     },
+    "ClvpModelForConditionalGeneration": {
+        "tokenizer_classes": [
+            "ClvpTokenizer"
+        ],
+        "processor_classes": [
+            "ClvpFeatureExtractor"
+        ],
+        "model_classes": [],
+        "sha": "45df7581535be337ff781707b6c20994ca221f05"
+    },
     "CodeGenForCausalLM": {
         "tokenizer_classes": [
             "CodeGenTokenizer",
@@ -1039,7 +1049,8 @@
             "ConvNextImageProcessor"
         ],
         "model_classes": [
-            "ConvNextV2ForImageClassification"
+            "ConvNextV2ForImageClassification",
+            "TFConvNextV2ForImageClassification"
         ],
         "sha": "ee22bae1cbb87d66fc7f62f7e15a43d6ff80d3cc"
     },
@@ -1049,7 +1060,8 @@
             "ConvNextImageProcessor"
         ],
         "model_classes": [
-            "ConvNextV2Model"
+            "ConvNextV2Model",
+            "TFConvNextV2Model"
         ],
         "sha": "c4dd68ee1102cba05bcc483da2a88e39427b7249"
     },
@@ -2136,6 +2148,56 @@
         ],
         "sha": "683f6f73a2ab87801f1695a72d1af63cf173ab7c"
     },
+    "FalconForCausalLM": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForCausalLM"
+        ],
+        "sha": "60076d5dafc5e33ba9c90dcd05e7c0834e44049a"
+    },
+    "FalconForQuestionAnswering": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForQuestionAnswering"
+        ],
+        "sha": "b1ee9cd5fad2d177ea5a46df4611cd02f66ae788"
+    },
+    "FalconForSequenceClassification": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForSequenceClassification"
+        ],
+        "sha": "007838c0991c2b6a87dc49a8a5c20f29149a00fa"
+    },
+    "FalconForTokenClassification": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconForTokenClassification"
+        ],
+        "sha": "0ea6ae548773daa6e3317fddc058957e956eebf4"
+    },
+    "FalconModel": {
+        "tokenizer_classes": [
+            "PreTrainedTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "FalconModel"
+        ],
+        "sha": "ca15a579c946eb00c5b39cc8e0ea63d0c1460f84"
+    },
     "FlaubertForMultipleChoice": {
         "tokenizer_classes": [
             "FlaubertTokenizer"
@@ -2364,6 +2426,18 @@
         ],
         "sha": "bfbaa8fa21c3abf80b94e7168b5ecff8ec5b5f76"
     },
+    "FuyuForCausalLM": {
+        "tokenizer_classes": [
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [
+            "FuyuImageProcessor"
+        ],
+        "model_classes": [
+            "FuyuForCausalLM"
+        ],
+        "sha": "685d78258ea95c5c82e0e4555d0d4a2270ab8bff"
+    },
     "GLPNForDepthEstimation": {
         "tokenizer_classes": [],
         "processor_classes": [
@@ -2866,6 +2940,30 @@
         ],
         "sha": "5a7983e48d5841704733dd0756177680ed50c074"
     },
+    "Kosmos2ForConditionalGeneration": {
+        "tokenizer_classes": [
+            "XLMRobertaTokenizerFast"
+        ],
+        "processor_classes": [
+            "CLIPImageProcessor"
+        ],
+        "model_classes": [
+            "Kosmos2ForConditionalGeneration"
+        ],
+        "sha": "d1d4607782b911411676f1ee79997dee645def58"
+    },
+    "Kosmos2Model": {
+        "tokenizer_classes": [
+            "XLMRobertaTokenizerFast"
+        ],
+        "processor_classes": [
+            "CLIPImageProcessor"
+        ],
+        "model_classes": [
+            "Kosmos2Model"
+        ],
+        "sha": "379d8944a65312094d9ab1c4b8a82058a2d3274e"
+    },
     "LEDForConditionalGeneration": {
         "tokenizer_classes": [
             "LEDTokenizer",
@@ -3820,6 +3918,39 @@
         ],
         "sha": "f197d5bfa1fe27b5f28a6e6d4e3ad229b753450a"
     },
+    "MistralForCausalLM": {
+        "tokenizer_classes": [
+            "LlamaTokenizer",
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "MistralForCausalLM"
+        ],
+        "sha": "f7e06aeedbba8f4f665b438b868ed932d451f64b"
+    },
+    "MistralForSequenceClassification": {
+        "tokenizer_classes": [
+            "LlamaTokenizer",
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "MistralForSequenceClassification"
+        ],
+        "sha": "65045444ea1933309270d8b08b21d3fa94a84290"
+    },
+    "MistralModel": {
+        "tokenizer_classes": [
+            "LlamaTokenizer",
+            "LlamaTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "MistralModel"
+        ],
+        "sha": "becd727ad72b1e8a7c0fa0ea39b61904fa68aeac"
+    },
     "MobileBertForMaskedLM": {
         "tokenizer_classes": [
             "MobileBertTokenizer",
@@ -4558,6 +4689,32 @@
         ],
         "sha": "f0e27b2b4e53ba70e05d13dcfea8e85272b292a5"
     },
+    "Owlv2ForObjectDetection": {
+        "tokenizer_classes": [
+            "CLIPTokenizer",
+            "CLIPTokenizerFast"
+        ],
+        "processor_classes": [
+            "Owlv2ImageProcessor"
+        ],
+        "model_classes": [
+            "Owlv2ForObjectDetection"
+        ],
+        "sha": "30439c0b2749726468dc13a755261e8101170052"
+    },
+    "Owlv2Model": {
+        "tokenizer_classes": [
+            "CLIPTokenizer",
+            "CLIPTokenizerFast"
+        ],
+        "processor_classes": [
+            "Owlv2ImageProcessor"
+        ],
+        "model_classes": [
+            "Owlv2Model"
+        ],
+        "sha": "7aeebdad5f72b36cb07c74355afad8e6052e2377"
+    },
     "PLBartForCausalLM": {
         "tokenizer_classes": [
             "PLBartTokenizer"
@@ -4760,6 +4917,50 @@
         ],
         "sha": "b8c8d479e29e9ee048e2d0b05b001ac835ad8859"
     },
+    "PhiForCausalLM": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiForCausalLM"
+        ],
+        "sha": "3fecc0109a4a3a230e3a5509eaf47a26eba85d79"
+    },
+    "PhiForSequenceClassification": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiForSequenceClassification"
+        ],
+        "sha": "e1c9f8ebf1317516acc1cd6338de71a53e770245"
+    },
+    "PhiForTokenClassification": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiForTokenClassification"
+        ],
+        "sha": "d3a8054903753b5c96c05eaf9877905a116a1d5e"
+    },
+    "PhiModel": {
+        "tokenizer_classes": [
+            "CodeGenTokenizer",
+            "CodeGenTokenizerFast"
+        ],
+        "processor_classes": [],
+        "model_classes": [
+            "PhiModel"
+        ],
+        "sha": "99c38d5ce7ace35127d00ed3eeb3561308ea6b21"
+    },
     "Pix2StructForConditionalGeneration": {
         "tokenizer_classes": [
             "T5TokenizerFast"
@@ -4768,7 +4969,9 @@
             "Pix2StructImageProcessor",
             "Pix2StructProcessor"
         ],
-        "model_classes": [],
+        "model_classes": [
+            "Pix2StructForConditionalGeneration"
+        ],
         "sha": "42b3de00ad535076c4893e4ac5ae2d2748cc4ccb"
     },
     "PoolFormerForImageClassification": {
@@ -5691,6 +5894,16 @@
         ],
         "sha": "25ba2d88c770533f8c69811d2a454a00c1d09f5d"
     },
+    "Swin2SRForImageSuperResolution": {
+        "tokenizer_classes": [],
+        "processor_classes": [
+            "Swin2SRImageProcessor"
+        ],
+        "model_classes": [
+            "Swin2SRForImageSuperResolution"
+        ],
+        "sha": "3a2780de0b455084c018ac8a62b56040969e26ec"
+    },
     "Swin2SRModel": {
         "tokenizer_classes": [],
         "processor_classes": [
@@ -6625,6 +6838,18 @@
         ],
         "sha": "d71b13674b1a67443cd19d0594a3b5b1e5968f0d"
     },
+    "WhisperForCausalLM": {
+        "tokenizer_classes": [
+            "WhisperTokenizer"
+        ],
+        "processor_classes": [
+            "WhisperFeatureExtractor"
+        ],
+        "model_classes": [
+            "WhisperForCausalLM"
+        ],
+        "sha": "e7febfd7f4512e029293c677e6d2633e23fc459a"
+    },
     "WhisperForConditionalGeneration": {
         "tokenizer_classes": [
             "WhisperTokenizer",
diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py
index 7a116d5af3178b..e1682fb07589ba 100644
--- a/utils/check_config_attributes.py
+++ b/utils/check_config_attributes.py
@@ -96,6 +96,21 @@
         "t2u_encoder_layers",
         "t2u_max_position_embeddings",
     ],
+    # Actually used in the config or generation config, in that case necessary for the sub-components generation
+    "SeamlessM4Tv2Config": [
+        "max_new_tokens",
+        "t2u_decoder_attention_heads",
+        "t2u_decoder_ffn_dim",
+        "t2u_decoder_layers",
+        "t2u_encoder_attention_heads",
+        "t2u_encoder_ffn_dim",
+        "t2u_encoder_layers",
+        "t2u_max_position_embeddings",
+        "t2u_variance_pred_dropout",
+        "t2u_variance_predictor_embed_dim",
+        "t2u_variance_predictor_hidden_dim",
+        "t2u_variance_predictor_kernel_size",
+    ],
 }
 
 
@@ -127,7 +142,6 @@
         "SwitchTransformersConfig": True,
         "TableTransformerConfig": True,
         "TapasConfig": True,
-        "TransfoXLConfig": True,
         "UniSpeechConfig": True,
         "UniSpeechSatConfig": True,
         "WavLMConfig": True,
diff --git a/utils/check_copies.py b/utils/check_copies.py
index c536eb800ff084..3d352cc8310e4f 100644
--- a/utils/check_copies.py
+++ b/utils/check_copies.py
@@ -41,7 +41,8 @@
 import os
 import re
 import subprocess
-from typing import List, Optional, Tuple
+from collections import OrderedDict
+from typing import List, Optional, Tuple, Union
 
 from transformers.utils import direct_transformers_import
 
@@ -125,13 +126,213 @@
 transformers_module = direct_transformers_import(TRANSFORMERS_PATH)
 
 
+def _is_definition_header_ending_line(line: str) -> bool:
+    # Helper function. Returns `True` if `line` is the end parenthesis of a class/function definition
+    return re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None
+
+
 def _should_continue(line: str, indent: str) -> bool:
     # Helper function. Returns `True` if `line` is empty, starts with the `indent` or is the end parenthesis of a
-    # function definition
-    return line.startswith(indent) or len(line.strip()) == 0 or re.search(r"^\s*\)(\s*->.*:|:)\s*$", line) is not None
+    # class/function definition
+    return line.startswith(indent) or len(line.strip()) == 0 or _is_definition_header_ending_line(line)
+
+
+def _sanity_check_splits(splits_1, splits_2, is_class):
+    """Check the two (inner) block structures of the corresponding code block given by `split_code_into_blocks` match.
+
+    For the case of `class`, they must be of one of the following 3 cases:
+
+        - a single block without name:
+
+            class foo:
+                a = 1
+
+        - a consecutive sequence of (1 or more) blocks with name
+
+            class foo:
+
+                def f(x):
+                    return x
+
+        - a block without name, followed by a consecutive sequence of (1 or more) blocks with name
+
+            class foo:
+                a = 1
+
+                def f(x):
+                    return x
+
+                def g(x):
+                    return None
+
+    The 2 code snippets that give `splits_1` and `splits_2` have to be in the same case to pass this check, but the
+    number of blocks with name in the consecutive sequence is not taken into account.
+
+    For the case of `function or method`, we don't require it to be in one of the above 3 cases. However, the structure
+    of`splits_1` and `splits_2` have to match exactly. In particular, the number of blocks with name in a consecutive
+    sequence is taken into account.
+    """
+    block_names_1 = []
+    block_names_2 = []
+
+    for block in splits_1[1:]:
+        if block[0].startswith("_block_without_name_"):
+            block_names_1.append("block_without_name")
+        elif not block[0].startswith("_empty_block_") and (
+            not is_class or len(block_names_1) == 0 or block_names_1[-1].startswith("block_without_name")
+        ):
+            block_names_1.append("block_with_name")
+
+    for block in splits_2[1:]:
+        if block[0].startswith("_block_without_name_"):
+            block_names_2.append("block_without_name")
+        elif not block[0].startswith("_empty_block_") and (
+            not is_class or len(block_names_2) == 0 or block_names_2[-1].startswith("block_without_name")
+        ):
+            block_names_2.append("block_with_name")
+
+    if is_class:
+        if block_names_1 not in [
+            ["block_without_name"],
+            ["block_with_name"],
+            ["block_without_name", "block_with_name"],
+        ]:
+            raise ValueError(
+                "For a class, it must have a specific structure. See the docstring of `_sanity_check_splits` in the file `utils/check_copies.py`"
+            )
+
+    if block_names_1 != block_names_2:
+        raise ValueError("The structures in the 2 code blocks differ.")
+
+
+def find_block_end(lines: List[str], start_index: int, indent: int) -> int:
+    """
+    Find the end of the class/func block starting at `start_index` in a source code (defined by `lines`).
+
+    Args:
+        lines (`List[str]`):
+            The source code, represented by a list of lines.
+        start_index (`int`):
+            The starting index of the target class/func block.
+        indent (`int`):
+            The indent of the class/func body.
+
+    Returns:
+        `int`: The index of the block's ending line plus by 1 (i.e. exclusive).
+    """
+    indent = " " * indent
+    # enter the block body
+    line_index = start_index + 1
+
+    while line_index < len(lines) and _should_continue(lines[line_index], indent):
+        line_index += 1
+    # Clean up empty lines at the end (if any).
+    while len(lines[line_index - 1]) <= 1:
+        line_index -= 1
+
+    return line_index
+
+
+def split_code_into_blocks(
+    lines: List[str], start_index: int, end_index: int, indent: int, backtrace: bool = False
+) -> List[Tuple[str, int, int]]:
+    """
+    Split the class/func block starting at `start_index` in a source code (defined by `lines`) into *inner blocks*.
+
+    The block's header is included as the first element. The contiguous regions (without empty lines) that are not
+    inside any inner block are included as blocks. The contiguous regions of empty lines that are not inside any inner
+    block are also included as (dummy) blocks.
 
+    Args:
+        lines (`List[str]`):
+            The source code, represented by a list of lines.
+        start_index (`int`):
+            The starting index of the target class/func block.
+        end_index (`int`):
+            The ending index of the target class/func block.
+        indent (`int`):
+            The indent of the class/func body.
+        backtrace (`bool`, *optional*, defaults to `False`):
+            Whether or not to include the lines before the inner class/func block's header (e.g. comments, decorators,
+            etc.) until an empty line is encountered.
 
-def find_code_in_transformers(object_name: str, base_path: str = None) -> str:
+    Returns:
+        `List[Tuple[str, int, int]]`: A list of elements with the form `(block_name, start_index, end_index)`.
+    """
+    splits = []
+    # `indent - 4` is the indent level of the target class/func header
+    target_block_name = re.search(rf"^{' ' * (indent - 4)}((class|def)\s+\S+)(\(|\:)", lines[start_index]).groups()[0]
+
+    # from now on, the `block` means inner blocks unless explicitly specified
+    indent_str = " " * indent
+    block_without_name_idx = 0
+    empty_block_idx = 0
+
+    # Find the lines for the definition header
+    index = start_index
+    if "(" in lines[start_index] and "):" not in lines[start_index] in lines[start_index]:
+        while index < end_index:
+            if _is_definition_header_ending_line(lines[index]):
+                break
+            index += 1
+
+    # the first line outside the definition header
+    index += 1
+    splits.append((target_block_name, start_index, index))
+
+    block_start_index, prev_block_end_index = index, index
+    while index < end_index:
+        # if found, it will be an inner block
+        block_found = re.search(rf"^{indent_str}((class|def)\s+\S+)(\(|\:)", lines[index])
+        if block_found:
+            name = block_found.groups()[0]
+
+            block_end_index = find_block_end(lines, index, indent + 4)
+
+            # backtrace to include the lines before the found block's definition header (e.g. comments, decorators,
+            # etc.) until an empty line is encountered.
+            block_start_index = index
+            if index > prev_block_end_index and backtrace:
+                idx = index - 1
+                for idx in range(index - 1, prev_block_end_index - 2, -1):
+                    if not (len(lines[idx].strip()) > 0 and lines[idx].startswith(indent_str)):
+                        break
+                idx += 1
+                if idx < index:
+                    block_start_index = idx
+
+            # between the current found block and the previous found block
+            if block_start_index > prev_block_end_index:
+                # give it a dummy name
+                if len("".join(lines[prev_block_end_index:block_start_index]).strip()) == 0:
+                    prev_block_name = f"_empty_block_{empty_block_idx}"
+                    empty_block_idx += 1
+                else:
+                    prev_block_name = f"_block_without_name_{block_without_name_idx}"
+                    block_without_name_idx += 1
+                # Add it as a block
+                splits.append((prev_block_name, prev_block_end_index, block_start_index))
+
+            # Add the current found block
+            splits.append((name, block_start_index, block_end_index))
+            prev_block_end_index = block_end_index
+            index = block_end_index - 1
+
+        index += 1
+
+    if index > prev_block_end_index:
+        if len("".join(lines[prev_block_end_index:index]).strip()) == 0:
+            prev_block_name = f"_empty_block_{empty_block_idx}"
+        else:
+            prev_block_name = f"_block_without_name_{block_without_name_idx}"
+        splits.append((prev_block_name, prev_block_end_index, index))
+
+    return splits
+
+
+def find_code_in_transformers(
+    object_name: str, base_path: str = None, return_indices: bool = False
+) -> Union[str, Tuple[List[str], int, int]]:
     """
     Find and return the source code of an object.
 
@@ -140,9 +341,15 @@ def find_code_in_transformers(object_name: str, base_path: str = None) -> str:
             The name of the object we want the source code of.
         base_path (`str`, *optional*):
             The path to the base folder where files are checked. If not set, it will be set to `TRANSFORMERS_PATH`.
+        return_indices(`bool`, *optional*, defaults to `False`):
+            If `False`, will only return the code (as a string), otherwise it will also return the whole lines of the
+            file where the object specified by `object_name` is defined, together the start/end indices of the block in
+            the file that defines the object.
 
     Returns:
-        `str`: The source code of the object.
+        `Union[str, Tuple[List[str], int, int]]`: If `return_indices=False`, only the source code of the object will be
+        returned. Otherwise, it also returns the whole lines of the file where the object specified by `object_name` is
+        defined, together the start/end indices of the block in the file that defines the object.
     """
     parts = object_name.split(".")
     i = 0
@@ -181,22 +388,91 @@ def find_code_in_transformers(object_name: str, base_path: str = None) -> str:
             line_index < len(lines) and re.search(rf"^{indent}(class|def)\s+{name}(\(|\:)", lines[line_index]) is None
         ):
             line_index += 1
+        # find the target specified in the current level in `parts` -> increase `indent` so we can search the next
         indent += "    "
+        # the index of the first line in the (currently found) block *body*
         line_index += 1
 
     if line_index >= len(lines):
         raise ValueError(f" {object_name} does not match any function or class in {module}.")
 
+    # `indent` is already one level deeper than the (found) class/func block's definition header
+
     # We found the beginning of the class / func, now let's find the end (when the indent diminishes).
+    # `start_index` is the index of the class/func block's definition header
     start_index = line_index - 1
-    while line_index < len(lines) and _should_continue(lines[line_index], indent):
-        line_index += 1
-    # Clean up empty lines at the end (if any).
-    while len(lines[line_index - 1]) <= 1:
-        line_index -= 1
+    end_index = find_block_end(lines, start_index, len(indent))
+
+    code = "".join(lines[start_index:end_index])
+    return (code, (lines, start_index, end_index)) if return_indices else code
+
+
+def replace_code(code: str, replace_pattern: str) -> str:
+    """Replace `code` by a pattern of the form `with X1->X2,Y1->Y2,Z1->Z2`.
+
+    Args:
+        code (`str`): The code to be modified.
+        replace_pattern (`str`): The pattern used to modify `code`.
+
+    Returns:
+        `str`: The modified code.
+    """
+    if len(replace_pattern) > 0:
+        patterns = replace_pattern.replace("with", "").split(",")
+        patterns = [_re_replace_pattern.search(p) for p in patterns]
+        for pattern in patterns:
+            if pattern is None:
+                continue
+            obj1, obj2, option = pattern.groups()
+            code = re.sub(obj1, obj2, code)
+            if option.strip() == "all-casing":
+                code = re.sub(obj1.lower(), obj2.lower(), code)
+                code = re.sub(obj1.upper(), obj2.upper(), code)
+
+    return code
+
 
-    code_lines = lines[start_index:line_index]
-    return "".join(code_lines)
+def find_code_and_splits(object_name: str, base_path: str, buffer: dict = None):
+    """Find the code of an object (specified by `object_name`) and split it into blocks.
+
+    Args:
+        object_name (`str`):
+            The name of the object, e.g. `transformers.models.bert.modeling_bert.BertAttention` or
+            `tests.models.llama.test_modeling_llama.LlamaModelTest.test_config`.
+        base_path (`str`):
+            The path to the base directory within which the search will be performed. It could be either
+            `TRANSFORMERS_PATH` or `MODEL_TEST_PATH`.
+        buffer (`dict`, *optional*):
+            The buffer used to store the previous results in order to speed up the process.
+
+    Returns:
+        lines (`List[str]`):
+            The lines of the whole file where the object is defined.
+        code (`str`):
+            The object's code.
+        code_splits (`List[Tuple[str, int, int]]`):
+            `code` splitted into blocks. See `split_code_into_blocks`.
+    """
+    if buffer is None:
+        buffer = {}
+
+    if (object_name, base_path) in buffer:
+        lines, code, code_splits = buffer[(object_name, base_path)]
+    else:
+        code, (lines, target_start_index, target_end_index) = find_code_in_transformers(
+            object_name, base_path=base_path, return_indices=True
+        )
+        indent = get_indent(code)
+
+        # Split the code into blocks
+        # `indent` is the indent of the class/func definition header, but `code_splits` expects the indent level of the
+        # block body.
+        code_splits = split_code_into_blocks(
+            lines, target_start_index, target_end_index, len(indent) + 4, backtrace=True
+        )
+        buffer[(object_name, base_path)] = lines, code, code_splits
+
+    return lines, code, code_splits
 
 
 _re_copy_warning = re.compile(r"^(\s*)#\s*Copied from\s+transformers\.(\S+\.\S+)\s*($|\S.*$)")
@@ -285,7 +561,7 @@ def check_codes_match(observed_code: str, theoretical_code: str) -> Optional[int
         diff_index += 1
 
 
-def is_copy_consistent(filename: str, overwrite: bool = False) -> Optional[List[Tuple[str, int]]]:
+def is_copy_consistent(filename: str, overwrite: bool = False, buffer: dict = None) -> Optional[List[Tuple[str, int]]]:
     """
     Check if the code commented as a copy in a file matches the original.
 
@@ -294,11 +570,15 @@ def is_copy_consistent(filename: str, overwrite: bool = False) -> Optional[List[
             The name of the file to check.
         overwrite (`bool`, *optional*, defaults to `False`):
             Whether or not to overwrite the copies when they don't match.
+        buffer (`dict`, *optional*):
+            The buffer used to store the previous results in order to speed up the process.
 
     Returns:
         `Optional[List[Tuple[str, int]]]`: If `overwrite=False`, returns the list of differences as tuples `(str, int)`
         with the name of the object having a diff and the line number where theere is the first diff.
     """
+    base_path = TRANSFORMERS_PATH if not filename.startswith("tests") else MODEL_TEST_PATH
+
     with open(filename, "r", encoding="utf-8", newline="\n") as f:
         lines = f.readlines()
     diffs = []
@@ -317,16 +597,31 @@ def is_copy_consistent(filename: str, overwrite: bool = False) -> Optional[List[
         # There is some copied code here, let's retrieve the original.
         indent, object_name, replace_pattern = search.groups()
 
-        base_path = TRANSFORMERS_PATH if not filename.startswith("tests") else MODEL_TEST_PATH
-        theoretical_code = find_code_in_transformers(object_name, base_path=base_path)
+        # Find the file lines, the object's code, and its blocks
+        target_lines, theoretical_code, theoretical_code_splits = find_code_and_splits(
+            object_name, base_path, buffer=buffer
+        )
+
+        # code replaced by the patterns
+        theoretical_code_blocks = OrderedDict()
+        for name, start, end in theoretical_code_splits:
+            name = replace_code(name, replace_pattern)
+            code = "".join(target_lines[start:end])
+            code = replace_code(code, replace_pattern)
+            theoretical_code_blocks[name] = code
+
         theoretical_indent = get_indent(theoretical_code)
 
+        # `start_index` is the index of the first line (the definition header) after `# Copied from`.
+        # (`indent != theoretical_indent` doesn't seem to occur so far, not sure what this case is for.)
         start_index = line_index + 1 if indent == theoretical_indent else line_index
+        # enter the block body
         line_index = start_index + 1
 
         subcode = "\n".join(theoretical_code.split("\n")[1:])
         indent = get_indent(subcode)
         # Loop to check the observed code, stop when indentation diminishes or if we see a End copy comment.
+        # We can't call `find_block_end` directly as there is sth. special `# End copy"` here.
         should_continue = True
         while line_index < len(lines) and should_continue:
             line_index += 1
@@ -336,33 +631,118 @@ def is_copy_consistent(filename: str, overwrite: bool = False) -> Optional[List[
             # There is a special pattern `# End copy` to stop early. It's not documented cause it shouldn't really be
             # used.
             should_continue = _should_continue(line, indent) and re.search(f"^{indent}# End copy", line) is None
+        # `line_index` is outside the block
         # Clean up empty lines at the end (if any).
         while len(lines[line_index - 1]) <= 1:
             line_index -= 1
 
-        observed_code_lines = lines[start_index:line_index]
-        observed_code = "".join(observed_code_lines)
-
-        # Before comparing, use the `replace_pattern` on the original code.
-        if len(replace_pattern) > 0:
-            patterns = replace_pattern.replace("with", "").split(",")
-            patterns = [_re_replace_pattern.search(p) for p in patterns]
-            for pattern in patterns:
-                if pattern is None:
-                    continue
-                obj1, obj2, option = pattern.groups()
-                theoretical_code = re.sub(obj1, obj2, theoretical_code)
-                if option.strip() == "all-casing":
-                    theoretical_code = re.sub(obj1.lower(), obj2.lower(), theoretical_code)
-                    theoretical_code = re.sub(obj1.upper(), obj2.upper(), theoretical_code)
+        # Split the observed code into blocks
+        observed_code_splits = split_code_into_blocks(lines, start_index, line_index, len(indent), backtrace=True)
+
+        is_class = lines[start_index].startswith(f"{' ' * (len(indent) - 4)}class ")
+        # sanity check
+        _sanity_check_splits(theoretical_code_splits, observed_code_splits, is_class=is_class)
+
+        # observed code in a structured way (a dict mapping block names to blocks' code)
+        observed_code_blocks = OrderedDict()
+        for name, start, end in observed_code_splits:
+            code = "".join(lines[start:end])
+            observed_code_blocks[name] = code
+
+        # Below, we change some names in `theoretical_code_blocks` and `observed_code_blocks`. These mappings map the
+        # original names to the modified names: this is used to restore the original order of the code blocks.
+        name_mappings_1 = {k: k for k in theoretical_code_blocks.keys()}
+        name_mappings_2 = {k: k for k in observed_code_blocks.keys()}
+
+        # Update code blocks' name and content:
+        #   If `"# Ignore copy"` is found in a block of the observed code:
+        #     1. if it's a block only in the observed code --> add it to the theoretical code.
+        #     2. if it's also in the theoretical code () --> put its content (body) to the corresponding block under the
+        #        same name in the theoretical code.
+        #   In both cases, we change the name to have a prefix `_ignored_` so we know if we can discard them during the
+        #   comparison.
+        ignored_existing_block_index = 0
+        ignored_new_block_index = 0
+        for name in list(observed_code_blocks.keys()):
+            code = observed_code_blocks[name]
+            if "# Ignore copy" in code:
+                if name in theoretical_code_blocks:
+                    # in the target --> just copy the content
+                    del theoretical_code_blocks[name]
+                    theoretical_code_blocks[f"_ignored_existing_block_{ignored_existing_block_index}"] = code
+                    name_mappings_1[name] = f"_ignored_existing_block_{ignored_existing_block_index}"
+
+                    del observed_code_blocks[name]
+                    observed_code_blocks[f"_ignored_existing_block_{ignored_existing_block_index}"] = code
+                    name_mappings_2[name] = f"_ignored_existing_block_{ignored_existing_block_index}"
+                    ignored_existing_block_index += 1
+                else:
+                    # not in the target --> add it
+                    theoretical_code_blocks[f"_ignored_new_block_{ignored_new_block_index}"] = code
+                    name_mappings_1[
+                        f"_ignored_new_block_{ignored_new_block_index}"
+                    ] = f"_ignored_new_block_{ignored_new_block_index}"
+
+                    del observed_code_blocks[name]
+                    observed_code_blocks[f"_ignored_new_block_{ignored_new_block_index}"] = code
+                    name_mappings_2[name] = f"_ignored_new_block_{ignored_new_block_index}"
+                    ignored_new_block_index += 1
+
+        # Respect the original block order:
+        #   1. in `theoretical_code_blocks`: the new blocks will follow the existing ones
+        #   2. in `observed_code_blocks`: the original order are kept with names modified potentially. This is necessary
+        #      to compute the correct `diff_index` if `overwrite=True` and there is a diff.
+        theoretical_code_blocks = {
+            name_mappings_1[orig_name]: theoretical_code_blocks[name_mappings_1[orig_name]]
+            for orig_name in name_mappings_1
+        }
+        observed_code_blocks = {
+            name_mappings_2[orig_name]: observed_code_blocks[name_mappings_2[orig_name]]
+            for orig_name in name_mappings_2
+        }
+
+        # Ignore the blocks specified to be ignored. This is the version used to check if there is a mismatch
+        theoretical_code_blocks_clean = {
+            k: v
+            for k, v in theoretical_code_blocks.items()
+            if not (k.startswith(("_ignored_existing_block_", "_ignored_new_block_")))
+        }
+        theoretical_code = "".join(list(theoretical_code_blocks_clean.values()))
+
+        # stylify `theoretical_code` before compare (this is needed only when `replace_pattern` is not empty)
+        if replace_pattern:
             theoretical_code = stylify(theoretical_code)
+        # Remove `\n\n` in `theoretical_code` before compare (so no empty line)
+        while "\n\n" in theoretical_code:
+            theoretical_code = theoretical_code.replace("\n\n", "\n")
+
+        # Compute `observed_code` where we don't include any empty line + keep track the line index between the
+        # original/processed `observed_code` so we can have the correct `diff_index`.
+        idx_to_orig_idx_mapping_for_observed_code_lines = {}
+        idx = -1
+        orig_idx = -1
+        observed_code = ""
+        for name, code in observed_code_blocks.items():
+            if code.endswith("\n"):
+                code = code[:-1]
+            for code_line in code.split("\n"):
+                orig_idx += 1
+                if code_line.strip() and not name.startswith(("_ignored_existing_block_", "_ignored_new_block_")):
+                    idx += 1
+                    observed_code += code_line + "\n"
+                    idx_to_orig_idx_mapping_for_observed_code_lines[idx] = orig_idx
 
         # Test for a diff and act accordingly.
         diff_index = check_codes_match(observed_code, theoretical_code)
         if diff_index is not None:
+            # switch to the index in the original `observed_code` (i.e. before removing empty lines)
+            diff_index = idx_to_orig_idx_mapping_for_observed_code_lines[diff_index]
             diffs.append([object_name, diff_index + start_index + 1])
             if overwrite:
-                lines = lines[:start_index] + [theoretical_code] + lines[line_index:]
+                # `theoretical_code_to_write` is a single string but may have several lines.
+                theoretical_code_to_write = stylify("".join(list(theoretical_code_blocks.values())))
+                lines = lines[:start_index] + [theoretical_code_to_write] + lines[line_index:]
+                # Here we treat it as a single entry in `lines`.
                 line_index = start_index + 1
 
     if overwrite and len(diffs) > 0:
@@ -373,7 +753,7 @@ def is_copy_consistent(filename: str, overwrite: bool = False) -> Optional[List[
     return diffs
 
 
-def check_copies(overwrite: bool = False):
+def check_copies(overwrite: bool = False, file: str = None):
     """
     Check every file is copy-consistent with the original. Also check the model list in the main README and other
     READMEs are consistent.
@@ -381,14 +761,21 @@ def check_copies(overwrite: bool = False):
     Args:
         overwrite (`bool`, *optional*, defaults to `False`):
             Whether or not to overwrite the copies when they don't match.
+        file (`bool`, *optional*):
+            The path to a specific file to check and/or fix.
     """
-    all_files = glob.glob(os.path.join(TRANSFORMERS_PATH, "**/*.py"), recursive=True)
-    all_test_files = glob.glob(os.path.join(MODEL_TEST_PATH, "**/*.py"), recursive=True)
-    all_files = list(all_files) + list(all_test_files)
+    buffer = {}
+
+    if file is None:
+        all_files = glob.glob(os.path.join(TRANSFORMERS_PATH, "**/*.py"), recursive=True)
+        all_test_files = glob.glob(os.path.join(MODEL_TEST_PATH, "**/*.py"), recursive=True)
+        all_files = list(all_files) + list(all_test_files)
+    else:
+        all_files = [file]
 
     diffs = []
     for filename in all_files:
-        new_diffs = is_copy_consistent(filename, overwrite)
+        new_diffs = is_copy_consistent(filename, overwrite, buffer)
         diffs += [f"- {filename}: copy does not match {d[0]} at line {d[1]}" for d in new_diffs]
     if not overwrite and len(diffs) > 0:
         diff = "\n".join(diffs)
@@ -673,6 +1060,7 @@ def check_model_list_copy(overwrite: bool = False):
     "TimmBackbone",
     "Vision Encoder decoder",
     "VisionTextDualEncoder",
+    "CLIPVisionModel",
 ]
 
 # Template for new entries to add in the main README when we have missing models.
@@ -732,9 +1120,10 @@ def check_readme(overwrite: bool = False):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
+    parser.add_argument("--file", type=str, default=None, help="A specific file to check and/or fix")
     parser.add_argument("--fix_and_overwrite", action="store_true", help="Whether to fix inconsistencies.")
     args = parser.parse_args()
 
     check_readme(args.fix_and_overwrite)
-    check_copies(args.fix_and_overwrite)
+    check_copies(args.fix_and_overwrite, args.file)
     check_full_copies(args.fix_and_overwrite)
diff --git a/utils/check_docstrings.py b/utils/check_docstrings.py
index cddd04ac6516d1..b9a1f0fde1c0cf 100644
--- a/utils/check_docstrings.py
+++ b/utils/check_docstrings.py
@@ -233,6 +233,8 @@
     "FlaxGPTJModel",
     "FlaxGPTNeoForCausalLM",
     "FlaxGPTNeoModel",
+    "FlaxLlamaForCausalLM",
+    "FlaxLlamaModel",
     "FlaxMBartForConditionalGeneration",
     "FlaxMBartForQuestionAnswering",
     "FlaxMBartForSequenceClassification",
@@ -463,6 +465,7 @@
     "SamConfig",
     "SamPromptEncoderConfig",
     "SeamlessM4TConfig",  # use of unconventional markdown
+    "SeamlessM4Tv2Config",  # use of unconventional markdown
     "Seq2SeqTrainingArguments",
     "SpecialTokensMixin",
     "Speech2Text2Config",
@@ -738,7 +741,6 @@
     "TrainerState",
     "TrainingArguments",
     "TrajectoryTransformerConfig",
-    "TransfoXLConfig",
     "TranslationPipeline",
     "TvltImageProcessor",
     "UMT5Config",
diff --git a/utils/check_inits.py b/utils/check_inits.py
index 383aa9c5db1f39..b9a637e6354bba 100644
--- a/utils/check_inits.py
+++ b/utils/check_inits.py
@@ -330,6 +330,7 @@ def get_transformers_submodules() -> List[str]:
     "modeling_flax_pytorch_utils",
     "models.esm.openfold_utils",
     "modeling_attn_mask_utils",
+    "safetensors_conversion",
 ]
 
 
diff --git a/utils/check_repo.py b/utils/check_repo.py
index d740eefed01936..aac06276234632 100644
--- a/utils/check_repo.py
+++ b/utils/check_repo.py
@@ -76,6 +76,9 @@
     "Kosmos2TextModel",
     "Kosmos2TextForCausalLM",
     "Kosmos2VisionModel",
+    "SeamlessM4Tv2TextToUnitModel",
+    "SeamlessM4Tv2CodeHifiGan",
+    "SeamlessM4Tv2TextToUnitForConditionalGeneration",
 ]
 
 # Update this list for models that are not tested with a comment explaining the reason it should not be.
@@ -114,7 +117,7 @@
     "BridgeTowerTextModel",  # No need to test it as it is tested by BridgeTowerModel model.
     "BridgeTowerVisionModel",  # No need to test it as it is tested by BridgeTowerModel model.
     "BarkCausalModel",  # Building part of bigger (tested) model.
-    "BarkModel",  # Does not have a forward signature - generation tested with integration tests
+    "BarkModel",  # Does not have a forward signature - generation tested with integration tests.
     "SeamlessM4TTextToUnitModel",  # Building part of bigger (tested) model.
     "SeamlessM4TCodeHifiGan",  # Building part of bigger (tested) model.
     "SeamlessM4TTextToUnitForConditionalGeneration",  # Building part of bigger (tested) model.
@@ -185,6 +188,8 @@
     "TimeSeriesTransformerForPrediction",
     "InformerForPrediction",
     "AutoformerForPrediction",
+    "PatchTSTForPretraining",
+    "PatchTSTForPrediction",
     "JukeboxVQVAE",
     "JukeboxPrior",
     "SamModel",
@@ -246,6 +251,8 @@
     "Owlv2TextModel",
     "Owlv2VisionModel",
     "OwlViTForObjectDetection",
+    "PatchTSMixerForPrediction",
+    "PatchTSMixerForPretraining",
     "RagModel",
     "RagSequenceForGeneration",
     "RagTokenForGeneration",
@@ -293,6 +300,11 @@
     "SeamlessM4TTextToUnitForConditionalGeneration",
     "SeamlessM4TCodeHifiGan",
     "SeamlessM4TForSpeechToSpeech",  # no auto class for speech-to-speech
+    "TvpForVideoGrounding",
+    "SeamlessM4Tv2NARTextToUnitModel",
+    "SeamlessM4Tv2NARTextToUnitForConditionalGeneration",
+    "SeamlessM4Tv2CodeHifiGan",
+    "SeamlessM4Tv2ForSpeechToSpeech",  # no auto class for speech-to-speech
 ]
 
 # DO NOT edit this list!
@@ -396,13 +408,11 @@ def get_model_modules() -> List[str]:
         "modeling_flax_speech_encoder_decoder",
         "modeling_flax_vision_encoder_decoder",
         "modeling_timm_backbone",
-        "modeling_transfo_xl_utilities",
         "modeling_tf_auto",
         "modeling_tf_encoder_decoder",
         "modeling_tf_outputs",
         "modeling_tf_pytorch_utils",
         "modeling_tf_utils",
-        "modeling_tf_transfo_xl_utilities",
         "modeling_tf_vision_encoder_decoder",
         "modeling_vision_encoder_decoder",
     ]
@@ -964,6 +974,7 @@ def find_all_documented_objects() -> List[str]:
     "TensorFlowBenchmark",
     "TensorFlowBenchmarkArguments",
     "AutoBackbone",
+    "BeitBackbone",
     "BitBackbone",
     "ConvNextBackbone",
     "ConvNextV2Backbone",
diff --git a/utils/check_table.py b/utils/check_table.py
index 6417858db0964f..6eec4754032e8e 100644
--- a/utils/check_table.py
+++ b/utils/check_table.py
@@ -155,6 +155,7 @@ def _center_text(text: str, width: int) -> str:
     "HerBERT": "BERT",
     "LayoutXLM": "LayoutLMv2",
     "Llama2": "LLaMA",
+    "MADLAD-400": "T5",
     "MatCha": "Pix2Struct",
     "mBART-50": "mBART",
     "Megatron-GPT2": "OpenAI GPT-2",
@@ -170,6 +171,7 @@ def _center_text(text: str, width: int) -> str:
     "XLS-R": "Wav2Vec2",
     "XLSR-Wav2Vec2": "Wav2Vec2",
 }
+MODEL_NAMES_TO_IGNORE = ["CLIPVisionModel"]
 
 
 def get_model_table_from_auto_modules() -> str:
@@ -242,6 +244,8 @@ def get_model_table_from_auto_modules() -> str:
     check = {True: "✅", False: "❌"}
 
     for name in model_names:
+        if name in MODEL_NAMES_TO_IGNORE:
+            continue
         if name in MODEL_NAMES_WITH_SAME_CONFIG.keys():
             prefix = model_name_to_prefix[MODEL_NAMES_WITH_SAME_CONFIG[name]]
         else:
diff --git a/utils/create_dummy_models.py b/utils/create_dummy_models.py
index 5ee53ef7dab623..3a2c39de4c8f93 100644
--- a/utils/create_dummy_models.py
+++ b/utils/create_dummy_models.py
@@ -420,17 +420,25 @@ def get_tiny_config(config_class, model_class=None, **model_tester_kwargs):
         error = f"Tiny config not created for {model_type} - no model tester is found in the testing module."
         raise ValueError(error)
 
+    # CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to
+    # `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
+
+    if "vocab_size" in model_tester_kwargs:
+        if "text_kwargs" in inspect.signature(model_tester_class.__init__).parameters.keys():
+            vocab_size = model_tester_kwargs.pop("vocab_size")
+            model_tester_kwargs["text_kwargs"] = {"vocab_size": vocab_size}
+
     # `parent` is an instance of `unittest.TestCase`, but we don't need it here.
     model_tester = model_tester_class(parent=None, **model_tester_kwargs)
 
     if hasattr(model_tester, "get_pipeline_config"):
-        return model_tester.get_pipeline_config()
+        config = model_tester.get_pipeline_config()
     elif hasattr(model_tester, "prepare_config_and_inputs"):
         # `PoolFormer` has no `get_config` defined. Furthermore, it's better to use `prepare_config_and_inputs` even if
         # `get_config` is defined, since there might be some extra changes in `prepare_config_and_inputs`.
-        return model_tester.prepare_config_and_inputs()[0]
+        config = model_tester.prepare_config_and_inputs()[0]
     elif hasattr(model_tester, "get_config"):
-        return model_tester.get_config()
+        config = model_tester.get_config()
     else:
         error = (
             f"Tiny config not created for {model_type} - the model tester {model_tester_class.__name__} lacks"
@@ -438,6 +446,26 @@ def get_tiny_config(config_class, model_class=None, **model_tester_kwargs):
         )
         raise ValueError(error)
 
+    # make sure this is long enough (some model tester has `20` for this attr.) to pass `text-generation`
+    # pipeline tests.
+    max_positions = []
+    for key in ["max_position_embeddings", "max_source_positions", "max_target_positions"]:
+        if getattr(config, key, 0) > 0:
+            max_positions.append(getattr(config, key))
+        if getattr(config, "text_config", None) is not None:
+            if getattr(config.text_config, key, None) is not None:
+                max_positions.append(getattr(config.text_config, key))
+    if len(max_positions) > 0:
+        max_position = max(200, min(max_positions))
+        for key in ["max_position_embeddings", "max_source_positions", "max_target_positions"]:
+            if getattr(config, key, 0) > 0:
+                setattr(config, key, max_position)
+            if getattr(config, "text_config", None) is not None:
+                if getattr(config.text_config, key, None) is not None:
+                    setattr(config.text_config, key, max_position)
+
+    return config
+
 
 def convert_tokenizer(tokenizer_fast: PreTrainedTokenizerFast):
     new_tokenizer = tokenizer_fast.train_new_from_iterator(
@@ -1006,26 +1034,8 @@ def get_config_overrides(config_class, processors):
 
     # Used to create a new model tester with `tokenizer.vocab_size` in order to get the (updated) special token ids.
     model_tester_kwargs = {"vocab_size": vocab_size}
-    # CLIP-like models have `text_model_tester` and `vision_model_tester`, and we need to pass `vocab_size` to
-    # `text_model_tester` via `text_kwargs`. The same trick is also necessary for `Flava`.
-    if config_class.__name__ in [
-        "AlignConfig",
-        "AltCLIPConfig",
-        "ChineseCLIPConfig",
-        "CLIPSegConfig",
-        "ClapConfig",
-        "CLIPConfig",
-        "GroupViTConfig",
-        "OwlViTConfig",
-        "XCLIPConfig",
-        "FlavaConfig",
-        "BlipConfig",
-        "Blip2Config",
-    ]:
-        del model_tester_kwargs["vocab_size"]
-        model_tester_kwargs["text_kwargs"] = {"vocab_size": vocab_size}
     # `FSMTModelTester` accepts `src_vocab_size` and `tgt_vocab_size` but not `vocab_size`.
-    elif config_class.__name__ == "FSMTConfig":
+    if config_class.__name__ == "FSMTConfig":
         del model_tester_kwargs["vocab_size"]
         model_tester_kwargs["src_vocab_size"] = tokenizer.src_vocab_size
         model_tester_kwargs["tgt_vocab_size"] = tokenizer.tgt_vocab_size
@@ -1158,7 +1168,9 @@ def build(config_class, models_to_create, output_dir):
         if hasattr(tiny_config, k):
             setattr(tiny_config, k, v)
         # So far, we only have to deal with `text_config`, as `config_overrides` contains text-related attributes only.
-        elif (
+        # `FuyuConfig` saves data under both FuyuConfig and its `text_config`. This is not good, but let's just update
+        # every involved fields to avoid potential failure.
+        if (
             hasattr(tiny_config, "text_config")
             and tiny_config.text_config is not None
             and hasattr(tiny_config.text_config, k)
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt
index 07775fe823a4dd..2cd16a0283e0ce 100644
--- a/utils/not_doctested.txt
+++ b/utils/not_doctested.txt
@@ -146,11 +146,13 @@ docs/source/en/model_doc/levit.md
 docs/source/en/model_doc/lilt.md
 docs/source/en/model_doc/llama.md
 docs/source/en/model_doc/llama2.md
+docs/source/en/model_doc/llava.md
 docs/source/en/model_doc/longformer.md
 docs/source/en/model_doc/longt5.md
 docs/source/en/model_doc/luke.md
 docs/source/en/model_doc/lxmert.md
 docs/source/en/model_doc/m2m_100.md
+docs/source/en/model_doc/madlad-400.md
 docs/source/en/model_doc/marian.md
 docs/source/en/model_doc/mask2former.md
 docs/source/en/model_doc/maskformer.md
@@ -237,6 +239,7 @@ docs/source/en/model_doc/upernet.md
 docs/source/en/model_doc/van.md
 docs/source/en/model_doc/videomae.md
 docs/source/en/model_doc/vilt.md
+docs/source/en/model_doc/vipllava.md
 docs/source/en/model_doc/vision-encoder-decoder.md
 docs/source/en/model_doc/vision-text-dual-encoder.md
 docs/source/en/model_doc/visual_bert.md
@@ -293,7 +296,7 @@ docs/source/en/serialization.md
 docs/source/en/tasks/asr.md
 docs/source/en/tasks/audio_classification.md
 docs/source/en/tasks/document_question_answering.md
-docs/source/en/tasks/idefics.md  # causes other tests to fail
+docs/source/en/tasks/idefics.md
 docs/source/en/tasks/image_captioning.md
 docs/source/en/tasks/image_classification.md
 docs/source/en/tasks/language_modeling.md
@@ -431,7 +434,7 @@ src/transformers/models/blip/modeling_blip_text.py
 src/transformers/models/blip/modeling_tf_blip_text.py
 src/transformers/models/blip_2/configuration_blip_2.py
 src/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py
-src/transformers/models/blip_2/modeling_blip_2.py  # causes other tests to fail
+src/transformers/models/blip_2/modeling_blip_2.py
 src/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py
 src/transformers/models/bloom/modeling_bloom.py
 src/transformers/models/bloom/modeling_flax_bloom.py
@@ -496,6 +499,11 @@ src/transformers/models/deprecated/tapex/tokenization_tapex.py
 src/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
 src/transformers/models/deprecated/trajectory_transformer/convert_trajectory_transformer_original_pytorch_checkpoint_to_pytorch.py
 src/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
+src/transformers/models/deprecated/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
+src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
+src/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
+src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
+src/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
 src/transformers/models/deprecated/van/configuration_van.py
 src/transformers/models/deprecated/van/convert_van_to_pytorch.py
 src/transformers/models/deprecated/van/modeling_van.py
@@ -628,6 +636,8 @@ src/transformers/models/lilt/configuration_lilt.py
 src/transformers/models/llama/configuration_llama.py
 src/transformers/models/llama/convert_llama_weights_to_hf.py
 src/transformers/models/llama/modeling_llama.py
+src/transformers/models/llava/configuration_llava.py
+src/transformers/models/llava/modeling_llava.py
 src/transformers/models/longformer/configuration_longformer.py
 src/transformers/models/longformer/convert_longformer_original_pytorch_lightning_to_pytorch.py
 src/transformers/models/longt5/configuration_longt5.py
@@ -668,6 +678,8 @@ src/transformers/models/mgp_str/configuration_mgp_str.py
 src/transformers/models/mgp_str/modeling_mgp_str.py
 src/transformers/models/mistral/configuration_mistral.py
 src/transformers/models/mistral/modeling_mistral.py
+src/transformers/models/mixtral/configuration_mixtral.py
+src/transformers/models/mixtral/modeling_mixtral.py
 src/transformers/models/mluke/convert_mluke_original_pytorch_checkpoint_to_pytorch.py
 src/transformers/models/mobilebert/convert_mobilebert_original_tf_checkpoint_to_pytorch.py
 src/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py
@@ -770,6 +782,7 @@ src/transformers/models/sam/modeling_sam.py
 src/transformers/models/sam/modeling_tf_sam.py
 src/transformers/models/sam/processing_sam.py
 src/transformers/models/seamless_m4t/convert_fairseq2_to_hf.py
+src/transformers/models/seamless_m4t_v2/convert_fairseq2_to_hf.py
 src/transformers/models/segformer/configuration_segformer.py
 src/transformers/models/segformer/convert_segformer_original_to_pytorch.py
 src/transformers/models/sew/convert_sew_original_pytorch_checkpoint_to_pytorch.py
@@ -818,11 +831,6 @@ src/transformers/models/tapas/modeling_tf_tapas.py
 src/transformers/models/timesformer/convert_timesformer_to_pytorch.py
 src/transformers/models/timm_backbone/configuration_timm_backbone.py
 src/transformers/models/timm_backbone/modeling_timm_backbone.py
-src/transformers/models/transfo_xl/convert_transfo_xl_original_tf_checkpoint_to_pytorch.py
-src/transformers/models/transfo_xl/modeling_tf_transfo_xl.py
-src/transformers/models/transfo_xl/modeling_tf_transfo_xl_utilities.py
-src/transformers/models/transfo_xl/modeling_transfo_xl.py
-src/transformers/models/transfo_xl/modeling_transfo_xl_utilities.py
 src/transformers/models/trocr/convert_trocr_unilm_to_pytorch.py
 src/transformers/models/tvlt/configuration_tvlt.py
 src/transformers/models/tvlt/modeling_tvlt.py
@@ -840,6 +848,8 @@ src/transformers/models/videomae/configuration_videomae.py
 src/transformers/models/videomae/convert_videomae_to_pytorch.py
 src/transformers/models/vilt/configuration_vilt.py
 src/transformers/models/vilt/convert_vilt_original_to_pytorch.py
+src/transformers/models/vipllava/configuration_vipllava.py
+src/transformers/models/vipllava/modeling_vipllava.py
 src/transformers/models/vision_encoder_decoder/modeling_flax_vision_encoder_decoder.py
 src/transformers/models/vision_encoder_decoder/modeling_tf_vision_encoder_decoder.py
 src/transformers/models/vision_text_dual_encoder/modeling_flax_vision_text_dual_encoder.py
@@ -990,4 +1000,4 @@ src/transformers/utils/peft_utils.py
 src/transformers/utils/quantization_config.py
 src/transformers/utils/sentencepiece_model_pb2.py
 src/transformers/utils/sentencepiece_model_pb2_new.py
-src/transformers/utils/versions.py
\ No newline at end of file
+src/transformers/utils/versions.py
diff --git a/utils/notification_service.py b/utils/notification_service.py
index 4df3b299448475..969107b3f88481 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -100,7 +100,13 @@ def dicts_to_sum(objects: Union[Dict[str, Dict], List[dict]]):
 
 class Message:
     def __init__(
-        self, title: str, ci_title: str, model_results: Dict, additional_results: Dict, selected_warnings: List = None
+        self,
+        title: str,
+        ci_title: str,
+        model_results: Dict,
+        additional_results: Dict,
+        selected_warnings: List = None,
+        prev_ci_artifacts=None,
     ):
         self.title = title
         self.ci_title = ci_title
@@ -119,10 +125,17 @@ def __init__(
         # Failures and success of the additional tests
         self.n_additional_success = sum(r["success"] for r in additional_results.values())
 
-        all_additional_failures = dicts_to_sum([r["failed"] for r in additional_results.values()])
-        self.n_additional_single_gpu_failures = all_additional_failures["single"]
-        self.n_additional_multi_gpu_failures = all_additional_failures["multi"]
-        self.n_additional_unknown_gpu_failures = all_additional_failures["unclassified"]
+        if len(additional_results) > 0:
+            # `dicts_to_sum` uses `dicts_to_sum` which requires a non empty dictionary. Let's just add an empty entry.
+            all_additional_failures = dicts_to_sum([r["failed"] for r in additional_results.values()])
+            self.n_additional_single_gpu_failures = all_additional_failures["single"]
+            self.n_additional_multi_gpu_failures = all_additional_failures["multi"]
+            self.n_additional_unknown_gpu_failures = all_additional_failures["unclassified"]
+        else:
+            self.n_additional_single_gpu_failures = 0
+            self.n_additional_multi_gpu_failures = 0
+            self.n_additional_unknown_gpu_failures = 0
+
         self.n_additional_failures = (
             self.n_additional_single_gpu_failures
             + self.n_additional_multi_gpu_failures
@@ -143,6 +156,8 @@ def __init__(
             selected_warnings = []
         self.selected_warnings = selected_warnings
 
+        self.prev_ci_artifacts = prev_ci_artifacts
+
     @property
     def time(self) -> str:
         all_results = [*self.model_results.values(), *self.additional_results.values()]
@@ -391,8 +406,6 @@ def per_model_sum(model_category_dict):
 
         # Save the complete (i.e. no truncation) failure tables (of the current workflow run)
         # (to be uploaded as artifacts)
-        if not os.path.isdir(os.path.join(os.getcwd(), "test_failure_tables")):
-            os.makedirs(os.path.join(os.getcwd(), "test_failure_tables"))
 
         model_failures_report = prepare_reports(
             title="These following model modules had failures",
@@ -400,7 +413,7 @@ def per_model_sum(model_category_dict):
             reports=sorted_model_reports,
             to_truncate=False,
         )
-        file_path = os.path.join(os.getcwd(), "test_failure_tables/model_failures_report.txt")
+        file_path = os.path.join(os.getcwd(), "prev_ci_results/model_failures_report.txt")
         with open(file_path, "w", encoding="UTF-8") as fp:
             fp.write(model_failures_report)
 
@@ -410,27 +423,18 @@ def per_model_sum(model_category_dict):
             reports=sorted_module_reports,
             to_truncate=False,
         )
-        file_path = os.path.join(os.getcwd(), "test_failure_tables/module_failures_report.txt")
+        file_path = os.path.join(os.getcwd(), "prev_ci_results/module_failures_report.txt")
         with open(file_path, "w", encoding="UTF-8") as fp:
             fp.write(module_failures_report)
 
-        target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
-        if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
-            # Get the last previously completed CI's failure tables
-            artifact_names = ["test_failure_tables"]
-            output_dir = os.path.join(os.getcwd(), "previous_reports")
-            os.makedirs(output_dir, exist_ok=True)
-            prev_tables = get_last_daily_ci_reports(
-                artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
-            )
-
-            # if the last run produces artifact named `test_failure_tables`
+        if self.prev_ci_artifacts is not None:
+            # if the last run produces artifact named `prev_ci_results`
             if (
-                "test_failure_tables" in prev_tables
-                and "model_failures_report.txt" in prev_tables["test_failure_tables"]
+                "prev_ci_results" in self.prev_ci_artifacts
+                and "model_failures_report.txt" in self.prev_ci_artifacts["prev_ci_results"]
             ):
                 # Compute the difference of the previous/current (model failure) table
-                prev_model_failures = prev_tables["test_failure_tables"]["model_failures_report.txt"]
+                prev_model_failures = self.prev_ci_artifacts["prev_ci_results"]["model_failures_report.txt"]
                 entries_changed = self.compute_diff_for_failure_reports(model_failures_report, prev_model_failures)
                 if len(entries_changed) > 0:
                     # Save the complete difference
@@ -440,7 +444,7 @@ def per_model_sum(model_category_dict):
                         reports=entries_changed,
                         to_truncate=False,
                     )
-                    file_path = os.path.join(os.getcwd(), "test_failure_tables/changed_model_failures_report.txt")
+                    file_path = os.path.join(os.getcwd(), "prev_ci_results/changed_model_failures_report.txt")
                     with open(file_path, "w", encoding="UTF-8") as fp:
                         fp.write(diff_report)
 
@@ -506,6 +510,10 @@ def payload(self) -> str:
         if len(self.selected_warnings) > 0:
             blocks.append(self.warnings)
 
+        new_failure_blocks = self.get_new_model_failure_blocks(with_header=False)
+        if len(new_failure_blocks) > 0:
+            blocks.extend(new_failure_blocks)
+
         return json.dumps(blocks)
 
     @staticmethod
@@ -625,6 +633,67 @@ def get_reply_blocks(self, job_name, job_result, failures, device, text):
             {"type": "section", "text": {"type": "mrkdwn", "text": failure_text}},
         ]
 
+    def get_new_model_failure_blocks(self, with_header=True):
+        if self.prev_ci_artifacts is None:
+            return {}
+
+        sorted_dict = sorted(self.model_results.items(), key=lambda t: t[0])
+
+        prev_model_results = {}
+        if (
+            "prev_ci_results" in self.prev_ci_artifacts
+            and "model_results.json" in self.prev_ci_artifacts["prev_ci_results"]
+        ):
+            prev_model_results = json.loads(self.prev_ci_artifacts["prev_ci_results"]["model_results.json"])
+
+        all_failure_lines = {}
+        for job, job_result in sorted_dict:
+            if len(job_result["failures"]):
+                devices = sorted(job_result["failures"].keys(), reverse=True)
+                for device in devices:
+                    failures = job_result["failures"][device]
+                    prev_error_lines = {}
+                    if job in prev_model_results and device in prev_model_results[job]["failures"]:
+                        prev_error_lines = {error["line"] for error in prev_model_results[job]["failures"][device]}
+
+                    url = None
+                    if job_result["job_link"] is not None and job_result["job_link"][device] is not None:
+                        url = job_result["job_link"][device]
+
+                    for idx, error in enumerate(failures):
+                        if error["line"] in prev_error_lines:
+                            continue
+
+                        new_text = f'{error["line"]}\n\n'
+
+                        if new_text not in all_failure_lines:
+                            all_failure_lines[new_text] = []
+
+                        all_failure_lines[new_text].append(f"<{url}|{device}>" if url is not None else device)
+
+        MAX_ERROR_TEXT = 3000 - len("[Truncated]") - len("```New model failures```\n\n")
+        failure_text = ""
+        for line, devices in all_failure_lines.items():
+            new_text = failure_text + f"{'|'.join(devices)} gpu\n{line}"
+            if len(new_text) > MAX_ERROR_TEXT:
+                # `failure_text` here has length <= 3000
+                failure_text = failure_text + "[Truncated]"
+                break
+            # `failure_text` here has length <= MAX_ERROR_TEXT
+            failure_text = new_text
+
+        blocks = []
+        if failure_text:
+            if with_header:
+                blocks.append(
+                    {"type": "header", "text": {"type": "plain_text", "text": "New model failures", "emoji": True}}
+                )
+            else:
+                failure_text = f"*New model failures*\n\n{failure_text}"
+            blocks.append({"type": "section", "text": {"type": "mrkdwn", "text": failure_text}})
+
+        return blocks
+
     def post_reply(self):
         if self.thread_ts is None:
             raise ValueError("Can only post reply if a post has been made.")
@@ -674,6 +743,20 @@ def post_reply(self):
 
                     time.sleep(1)
 
+        blocks = self.get_new_model_failure_blocks()
+        if blocks:
+            print("Sending the following reply")
+            print(json.dumps({"blocks": blocks}))
+
+            client.chat_postMessage(
+                channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"],
+                text="Results for new failures",
+                blocks=blocks,
+                thread_ts=self.thread_ts["ts"],
+            )
+
+            time.sleep(1)
+
 
 def retrieve_artifact(artifact_path: str, gpu: Optional[str]):
     if gpu not in [None, "single", "multi"]:
@@ -903,6 +986,9 @@ def prepare_reports(title, header, reports, to_truncate=True):
     elif ci_event.startswith("Push CI (AMD) - "):
         flavor = ci_event.replace("Push CI (AMD) - ", "")
         job_name_prefix = f"AMD {flavor}"
+    elif ci_event.startswith("Scheduled CI (AMD) - "):
+        flavor = ci_event.replace("Scheduled CI (AMD) - ", "")
+        job_name_prefix = f"AMD {flavor}"
 
     for model in model_results.keys():
         for artifact_path in available_artifacts[f"run_all_tests_gpu_{model}_test_reports"].paths:
@@ -968,10 +1054,15 @@ def prepare_reports(title, header, reports, to_truncate=True):
         "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
     }
 
-    if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI") or ci_event.startswith("Push CI (AMD)"):
+    if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"):
         del additional_files["Examples directory"]
         del additional_files["PyTorch pipelines"]
         del additional_files["TensorFlow pipelines"]
+    elif ci_event.startswith("Scheduled CI (AMD)"):
+        del additional_files["TensorFlow pipelines"]
+        del additional_files["Torch CUDA extension tests"]
+    elif ci_event.startswith("Push CI (AMD)"):
+        additional_files = {}
 
     additional_results = {
         key: {
@@ -1030,7 +1121,31 @@ def prepare_reports(title, header, reports, to_truncate=True):
         with open(os.path.join(directory, "selected_warnings.json")) as fp:
             selected_warnings = json.load(fp)
 
-    message = Message(title, ci_title, model_results, additional_results, selected_warnings=selected_warnings)
+    if not os.path.isdir(os.path.join(os.getcwd(), "prev_ci_results")):
+        os.makedirs(os.path.join(os.getcwd(), "prev_ci_results"))
+
+    with open("prev_ci_results/model_results.json", "w", encoding="UTF-8") as fp:
+        json.dump(model_results, fp, indent=4, ensure_ascii=False)
+
+    prev_ci_artifacts = None
+    target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main"
+    if os.environ.get("CI_WORKFLOW_REF") == target_workflow:
+        # Get the last previously completed CI's failure tables
+        artifact_names = ["prev_ci_results"]
+        output_dir = os.path.join(os.getcwd(), "previous_reports")
+        os.makedirs(output_dir, exist_ok=True)
+        prev_ci_artifacts = get_last_daily_ci_reports(
+            artifact_names=artifact_names, output_dir=output_dir, token=os.environ["ACCESS_REPO_INFO_TOKEN"]
+        )
+
+    message = Message(
+        title,
+        ci_title,
+        model_results,
+        additional_results,
+        selected_warnings=selected_warnings,
+        prev_ci_artifacts=prev_ci_artifacts,
+    )
 
     # send report only if there is any failure (for push CI)
     if message.n_failures or (ci_event != "push" and not ci_event.startswith("Push CI (AMD)")):
diff --git a/utils/slow_documentation_tests.txt b/utils/slow_documentation_tests.txt
index 64841a19cb6cf5..302778c7320039 100644
--- a/utils/slow_documentation_tests.txt
+++ b/utils/slow_documentation_tests.txt
@@ -2,6 +2,7 @@ docs/source/en/generation_strategies.md
 docs/source/en/model_doc/ctrl.md
 docs/source/en/model_doc/kosmos-2.md
 docs/source/en/model_doc/seamless_m4t.md
+docs/source/en/model_doc/seamless_m4t_v2.md
 docs/source/en/task_summary.md
 docs/source/en/tasks/prompting.md
 src/transformers/models/blip_2/modeling_blip_2.py
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index c7638a129a0c25..1aae59e4b33602 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -51,9 +51,11 @@
 
 import argparse
 import collections
+import importlib.util
 import json
 import os
 import re
+import tempfile
 from contextlib import contextmanager
 from pathlib import Path
 from typing import Dict, List, Optional, Tuple, Union
@@ -254,6 +256,122 @@ def diff_contains_doc_examples(repo: Repo, branching_point: str, filename: str)
     return old_content_clean != new_content_clean
 
 
+def get_impacted_files_from_tiny_model_summary(diff_with_last_commit: bool = False) -> List[str]:
+    """
+    Return a list of python modeling files that are impacted by the changes of `tiny_model_summary.json` in between:
+
+    - the current head and the main branch if `diff_with_last_commit=False` (default)
+    - the current head and its parent commit otherwise.
+
+    Returns:
+        `List[str]`: The list of Python modeling files that are impacted by the changes of `tiny_model_summary.json`.
+    """
+    repo = Repo(PATH_TO_REPO)
+
+    folder = Path(repo.working_dir)
+
+    if not diff_with_last_commit:
+        print(f"main is at {repo.refs.main.commit}")
+        print(f"Current head is at {repo.head.commit}")
+
+        commits = repo.merge_base(repo.refs.main, repo.head)
+        for commit in commits:
+            print(f"Branching commit: {commit}")
+    else:
+        print(f"main is at {repo.head.commit}")
+        commits = repo.head.commit.parents
+        for commit in commits:
+            print(f"Parent commit: {commit}")
+
+    if not os.path.isfile(folder / "tests/utils/tiny_model_summary.json"):
+        return []
+
+    files = set()
+    for commit in commits:
+        with checkout_commit(repo, commit):
+            with open(folder / "tests/utils/tiny_model_summary.json", "r", encoding="utf-8") as f:
+                old_content = f.read()
+
+        with open(folder / "tests/utils/tiny_model_summary.json", "r", encoding="utf-8") as f:
+            new_content = f.read()
+
+        # get the content as json object
+        old_content = json.loads(old_content)
+        new_content = json.loads(new_content)
+
+        old_keys = set(old_content.keys())
+        new_keys = set(new_content.keys())
+
+        # get the difference
+        keys_with_diff = old_keys.symmetric_difference(new_keys)
+        common_keys = old_keys.intersection(new_keys)
+        # if both have the same key, check its content
+        for key in common_keys:
+            if old_content[key] != new_content[key]:
+                keys_with_diff.add(key)
+
+        # get the model classes
+        impacted_model_classes = []
+        for key in keys_with_diff:
+            if key in new_keys:
+                impacted_model_classes.extend(new_content[key]["model_classes"])
+
+        # get the module where the model classes are defined. We want to use the main `__init__` file, but it requires
+        # all the framework being installed, which is not ideal for a simple script like test fetcher.
+        # So we create a temporary and modified main `__init__` and access its `_import_structure`.
+        with open(folder / "src/transformers/__init__.py") as fp:
+            lines = fp.readlines()
+            new_lines = []
+            # Get all the code related to `_import_structure`
+            for line in lines:
+                if line == "_import_structure = {\n":
+                    new_lines.append(line)
+                elif line == "# Direct imports for type-checking\n":
+                    break
+                elif len(new_lines) > 0:
+                    # bypass the framework check so we can get all the information even if frameworks are not available
+                    line = re.sub(r"is_.+_available\(\)", "True", line)
+                    line = line.replace("OptionalDependencyNotAvailable", "Exception")
+                    line = line.replace("Exception()", "Exception")
+                    new_lines.append(line)
+
+        # create and load the temporary module
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            with open(os.path.join(tmpdirname, "temp_init.py"), "w") as fp:
+                fp.write("".join(new_lines))
+
+            spec = importlib.util.spec_from_file_location("temp_init", os.path.join(tmpdirname, "temp_init.py"))
+            module = importlib.util.module_from_spec(spec)
+            spec.loader.exec_module(module)
+            # Finally, get `_import_structure` that we need
+            import_structure = module._import_structure
+
+            # map model classes to their defined module
+            reversed_structure = {}
+            for key, values in import_structure.items():
+                for value in values:
+                    reversed_structure[value] = key
+
+            # Get the corresponding modeling file path
+            for model_class in impacted_model_classes:
+                module = reversed_structure[model_class]
+                framework = ""
+                if model_class.startswith("TF"):
+                    framework = "tf"
+                elif model_class.startswith("Flax"):
+                    framework = "flax"
+                fn = (
+                    f"modeling_{module.split('.')[-1]}.py"
+                    if framework == ""
+                    else f"modeling_{framework}_{module.split('.')[-1]}.py"
+                )
+                files.add(
+                    f"src.transformers.{module}.{fn}".replace(".", os.path.sep).replace(f"{os.path.sep}py", ".py")
+                )
+
+    return sorted(files)
+
+
 def get_diff(repo: Repo, base_commit: str, commits: List[str]) -> List[str]:
     """
     Get the diff between a base commit and one or several commits.
@@ -949,18 +1067,16 @@ def infer_tests_to_run(
     if any(x in modified_files for x in ["setup.py", ".circleci/create_circleci_config.py"]):
         test_files_to_run = ["tests", "examples"]
         repo_utils_launch = True
-    # in order to trigger pipeline tests even if no code change at all
-    elif "tests/utils/tiny_model_summary.json" in modified_files:
-        test_files_to_run = ["tests"]
-        repo_utils_launch = any(f.split(os.path.sep)[0] == "utils" for f in modified_files)
     else:
         # All modified tests need to be run.
         test_files_to_run = [
             f for f in modified_files if f.startswith("tests") and f.split(os.path.sep)[-1].startswith("test")
         ]
+        impacted_files = get_impacted_files_from_tiny_model_summary(diff_with_last_commit=diff_with_last_commit)
+
         # Then we grab the corresponding test files.
         test_map = create_module_to_test_map(reverse_map=reverse_map, filter_models=filter_models)
-        for f in modified_files:
+        for f in modified_files + impacted_files:
             if f in test_map:
                 test_files_to_run.extend(test_map[f])
         test_files_to_run = sorted(set(test_files_to_run))